1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libyuv/source/row_posix.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,6443 @@ 1.4 +/* 1.5 + * Copyright 2011 The LibYuv Project Authors. All rights reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "libyuv/row.h" 1.15 + 1.16 +#ifdef __cplusplus 1.17 +namespace libyuv { 1.18 +extern "C" { 1.19 +#endif 1.20 + 1.21 +// This module is for GCC x86 and x64. 1.22 +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) 1.23 + 1.24 +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) 1.25 + 1.26 +// Constants for ARGB 1.27 +static vec8 kARGBToY = { 1.28 + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 1.29 +}; 1.30 + 1.31 +// JPeg full range. 
1.32 +static vec8 kARGBToYJ = { 1.33 + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 1.34 +}; 1.35 +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) 1.36 + 1.37 +#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) 1.38 + 1.39 +static vec8 kARGBToU = { 1.40 + 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 1.41 +}; 1.42 + 1.43 +static vec8 kARGBToUJ = { 1.44 + 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 1.45 +}; 1.46 + 1.47 +static vec8 kARGBToV = { 1.48 + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 1.49 +}; 1.50 + 1.51 +static vec8 kARGBToVJ = { 1.52 + -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 1.53 +}; 1.54 + 1.55 +// Constants for BGRA 1.56 +static vec8 kBGRAToY = { 1.57 + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 1.58 +}; 1.59 + 1.60 +static vec8 kBGRAToU = { 1.61 + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 1.62 +}; 1.63 + 1.64 +static vec8 kBGRAToV = { 1.65 + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 1.66 +}; 1.67 + 1.68 +// Constants for ABGR 1.69 +static vec8 kABGRToY = { 1.70 + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 1.71 +}; 1.72 + 1.73 +static vec8 kABGRToU = { 1.74 + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 1.75 +}; 1.76 + 1.77 +static vec8 kABGRToV = { 1.78 + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 1.79 +}; 1.80 + 1.81 +// Constants for RGBA. 
1.82 +static vec8 kRGBAToY = { 1.83 + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 1.84 +}; 1.85 + 1.86 +static vec8 kRGBAToU = { 1.87 + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 1.88 +}; 1.89 + 1.90 +static vec8 kRGBAToV = { 1.91 + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 1.92 +}; 1.93 + 1.94 +static uvec8 kAddY16 = { 1.95 + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u 1.96 +}; 1.97 + 1.98 +static vec16 kAddYJ64 = { 1.99 + 64, 64, 64, 64, 64, 64, 64, 64 1.100 +}; 1.101 + 1.102 +static uvec8 kAddUV128 = { 1.103 + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 1.104 + 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u 1.105 +}; 1.106 + 1.107 +static uvec16 kAddUVJ128 = { 1.108 + 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u 1.109 +}; 1.110 +#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) 1.111 + 1.112 +#ifdef HAS_RGB24TOARGBROW_SSSE3 1.113 + 1.114 +// Shuffle table for converting RGB24 to ARGB. 1.115 +static uvec8 kShuffleMaskRGB24ToARGB = { 1.116 + 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 1.117 +}; 1.118 + 1.119 +// Shuffle table for converting RAW to ARGB. 1.120 +static uvec8 kShuffleMaskRAWToARGB = { 1.121 + 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 1.122 +}; 1.123 + 1.124 +// Shuffle table for converting ARGB to RGB24. 1.125 +static uvec8 kShuffleMaskARGBToRGB24 = { 1.126 + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u 1.127 +}; 1.128 + 1.129 +// Shuffle table for converting ARGB to RAW. 1.130 +static uvec8 kShuffleMaskARGBToRAW = { 1.131 + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 1.132 +}; 1.133 + 1.134 +// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. 
First 8 + next 4 1.135 +static uvec8 kShuffleMaskARGBToRGB24_0 = { 1.136 + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u 1.137 +}; 1.138 + 1.139 +// Shuffle table for converting ARGB to RAW. 1.140 +static uvec8 kShuffleMaskARGBToRAW_0 = { 1.141 + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u 1.142 +}; 1.143 +#endif // HAS_RGB24TOARGBROW_SSSE3 1.144 + 1.145 +#if defined(TESTING) && defined(__x86_64__) 1.146 +void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 1.147 + asm volatile ( 1.148 + ".p2align 5 \n" 1.149 + "mov %%eax,%%eax \n" 1.150 + "mov %%ebx,%%ebx \n" 1.151 + "mov %%ecx,%%ecx \n" 1.152 + "mov %%edx,%%edx \n" 1.153 + "mov %%esi,%%esi \n" 1.154 + "mov %%edi,%%edi \n" 1.155 + "mov %%ebp,%%ebp \n" 1.156 + "mov %%esp,%%esp \n" 1.157 + ".p2align 5 \n" 1.158 + "mov %%r8d,%%r8d \n" 1.159 + "mov %%r9d,%%r9d \n" 1.160 + "mov %%r10d,%%r10d \n" 1.161 + "mov %%r11d,%%r11d \n" 1.162 + "mov %%r12d,%%r12d \n" 1.163 + "mov %%r13d,%%r13d \n" 1.164 + "mov %%r14d,%%r14d \n" 1.165 + "mov %%r15d,%%r15d \n" 1.166 + ".p2align 5 \n" 1.167 + "lea (%%rax),%%eax \n" 1.168 + "lea (%%rbx),%%ebx \n" 1.169 + "lea (%%rcx),%%ecx \n" 1.170 + "lea (%%rdx),%%edx \n" 1.171 + "lea (%%rsi),%%esi \n" 1.172 + "lea (%%rdi),%%edi \n" 1.173 + "lea (%%rbp),%%ebp \n" 1.174 + "lea (%%rsp),%%esp \n" 1.175 + ".p2align 5 \n" 1.176 + "lea (%%r8),%%r8d \n" 1.177 + "lea (%%r9),%%r9d \n" 1.178 + "lea (%%r10),%%r10d \n" 1.179 + "lea (%%r11),%%r11d \n" 1.180 + "lea (%%r12),%%r12d \n" 1.181 + "lea (%%r13),%%r13d \n" 1.182 + "lea (%%r14),%%r14d \n" 1.183 + "lea (%%r15),%%r15d \n" 1.184 + 1.185 + ".p2align 5 \n" 1.186 + "lea 0x10(%%rax),%%eax \n" 1.187 + "lea 0x10(%%rbx),%%ebx \n" 1.188 + "lea 0x10(%%rcx),%%ecx \n" 1.189 + "lea 0x10(%%rdx),%%edx \n" 1.190 + "lea 0x10(%%rsi),%%esi \n" 1.191 + "lea 0x10(%%rdi),%%edi \n" 1.192 + "lea 0x10(%%rbp),%%ebp \n" 1.193 + "lea 0x10(%%rsp),%%esp \n" 1.194 + ".p2align 5 \n" 1.195 + "lea 0x10(%%r8),%%r8d \n" 
1.196 + "lea 0x10(%%r9),%%r9d \n" 1.197 + "lea 0x10(%%r10),%%r10d \n" 1.198 + "lea 0x10(%%r11),%%r11d \n" 1.199 + "lea 0x10(%%r12),%%r12d \n" 1.200 + "lea 0x10(%%r13),%%r13d \n" 1.201 + "lea 0x10(%%r14),%%r14d \n" 1.202 + "lea 0x10(%%r15),%%r15d \n" 1.203 + 1.204 + ".p2align 5 \n" 1.205 + "add 0x10,%%eax \n" 1.206 + "add 0x10,%%ebx \n" 1.207 + "add 0x10,%%ecx \n" 1.208 + "add 0x10,%%edx \n" 1.209 + "add 0x10,%%esi \n" 1.210 + "add 0x10,%%edi \n" 1.211 + "add 0x10,%%ebp \n" 1.212 + "add 0x10,%%esp \n" 1.213 + ".p2align 5 \n" 1.214 + "add 0x10,%%r8d \n" 1.215 + "add 0x10,%%r9d \n" 1.216 + "add 0x10,%%r10d \n" 1.217 + "add 0x10,%%r11d \n" 1.218 + "add 0x10,%%r12d \n" 1.219 + "add 0x10,%%r13d \n" 1.220 + "add 0x10,%%r14d \n" 1.221 + "add 0x10,%%r15d \n" 1.222 + 1.223 + ".p2align 2 \n" 1.224 + "1: \n" 1.225 + "movq " MEMACCESS(0) ",%%xmm0 \n" 1.226 + "lea " MEMLEA(0x8,0) ",%0 \n" 1.227 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.228 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.229 + "sub $0x8,%2 \n" 1.230 + "jg 1b \n" 1.231 + : "+r"(src_y), // %0 1.232 + "+r"(dst_argb), // %1 1.233 + "+r"(pix) // %2 1.234 + : 1.235 + : "memory", "cc" 1.236 +#if defined(__SSE2__) 1.237 + , "xmm0", "xmm1", "xmm5" 1.238 +#endif 1.239 + ); 1.240 +} 1.241 +#endif // TESTING 1.242 + 1.243 +#ifdef HAS_I400TOARGBROW_SSE2 1.244 +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 1.245 + asm volatile ( 1.246 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.247 + "pslld $0x18,%%xmm5 \n" 1.248 + LABELALIGN 1.249 + "1: \n" 1.250 + "movq " MEMACCESS(0) ",%%xmm0 \n" 1.251 + "lea " MEMLEA(0x8,0) ",%0 \n" 1.252 + "punpcklbw %%xmm0,%%xmm0 \n" 1.253 + "movdqa %%xmm0,%%xmm1 \n" 1.254 + "punpcklwd %%xmm0,%%xmm0 \n" 1.255 + "punpckhwd %%xmm1,%%xmm1 \n" 1.256 + "por %%xmm5,%%xmm0 \n" 1.257 + "por %%xmm5,%%xmm1 \n" 1.258 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.259 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.260 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.261 + "sub $0x8,%2 \n" 1.262 + "jg 1b \n" 1.263 + : "+r"(src_y), // %0 
1.264 + "+r"(dst_argb), // %1 1.265 + "+r"(pix) // %2 1.266 + : 1.267 + : "memory", "cc" 1.268 +#if defined(__SSE2__) 1.269 + , "xmm0", "xmm1", "xmm5" 1.270 +#endif 1.271 + ); 1.272 +} 1.273 + 1.274 +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, 1.275 + int pix) { 1.276 + asm volatile ( 1.277 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.278 + "pslld $0x18,%%xmm5 \n" 1.279 + LABELALIGN 1.280 + "1: \n" 1.281 + "movq " MEMACCESS(0) ",%%xmm0 \n" 1.282 + "lea " MEMLEA(0x8,0) ",%0 \n" 1.283 + "punpcklbw %%xmm0,%%xmm0 \n" 1.284 + "movdqa %%xmm0,%%xmm1 \n" 1.285 + "punpcklwd %%xmm0,%%xmm0 \n" 1.286 + "punpckhwd %%xmm1,%%xmm1 \n" 1.287 + "por %%xmm5,%%xmm0 \n" 1.288 + "por %%xmm5,%%xmm1 \n" 1.289 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.290 + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 1.291 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.292 + "sub $0x8,%2 \n" 1.293 + "jg 1b \n" 1.294 + : "+r"(src_y), // %0 1.295 + "+r"(dst_argb), // %1 1.296 + "+r"(pix) // %2 1.297 + : 1.298 + : "memory", "cc" 1.299 +#if defined(__SSE2__) 1.300 + , "xmm0", "xmm1", "xmm5" 1.301 +#endif 1.302 + ); 1.303 +} 1.304 +#endif // HAS_I400TOARGBROW_SSE2 1.305 + 1.306 +#ifdef HAS_RGB24TOARGBROW_SSSE3 1.307 +void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 1.308 + asm volatile ( 1.309 + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 1.310 + "pslld $0x18,%%xmm5 \n" 1.311 + "movdqa %3,%%xmm4 \n" 1.312 + LABELALIGN 1.313 + "1: \n" 1.314 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.315 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.316 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" 1.317 + "lea " MEMLEA(0x30,0) ",%0 \n" 1.318 + "movdqa %%xmm3,%%xmm2 \n" 1.319 + "palignr $0x8,%%xmm1,%%xmm2 \n" 1.320 + "pshufb %%xmm4,%%xmm2 \n" 1.321 + "por %%xmm5,%%xmm2 \n" 1.322 + "palignr $0xc,%%xmm0,%%xmm1 \n" 1.323 + "pshufb %%xmm4,%%xmm0 \n" 1.324 + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" 1.325 + "por %%xmm5,%%xmm0 \n" 1.326 + "pshufb %%xmm4,%%xmm1 \n" 1.327 + "movdqa %%xmm0," MEMACCESS(1) " 
\n" 1.328 + "por %%xmm5,%%xmm1 \n" 1.329 + "palignr $0x4,%%xmm3,%%xmm3 \n" 1.330 + "pshufb %%xmm4,%%xmm3 \n" 1.331 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.332 + "por %%xmm5,%%xmm3 \n" 1.333 + "sub $0x10,%2 \n" 1.334 + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" 1.335 + "lea " MEMLEA(0x40,1) ",%1 \n" 1.336 + "jg 1b \n" 1.337 + : "+r"(src_rgb24), // %0 1.338 + "+r"(dst_argb), // %1 1.339 + "+r"(pix) // %2 1.340 + : "m"(kShuffleMaskRGB24ToARGB) // %3 1.341 + : "memory", "cc" 1.342 +#if defined(__SSE2__) 1.343 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.344 +#endif 1.345 + ); 1.346 +} 1.347 + 1.348 +void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { 1.349 + asm volatile ( 1.350 + "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 1.351 + "pslld $0x18,%%xmm5 \n" 1.352 + "movdqa %3,%%xmm4 \n" 1.353 + LABELALIGN 1.354 + "1: \n" 1.355 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.356 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.357 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" 1.358 + "lea " MEMLEA(0x30,0) ",%0 \n" 1.359 + "movdqa %%xmm3,%%xmm2 \n" 1.360 + "palignr $0x8,%%xmm1,%%xmm2 \n" 1.361 + "pshufb %%xmm4,%%xmm2 \n" 1.362 + "por %%xmm5,%%xmm2 \n" 1.363 + "palignr $0xc,%%xmm0,%%xmm1 \n" 1.364 + "pshufb %%xmm4,%%xmm0 \n" 1.365 + "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" 1.366 + "por %%xmm5,%%xmm0 \n" 1.367 + "pshufb %%xmm4,%%xmm1 \n" 1.368 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.369 + "por %%xmm5,%%xmm1 \n" 1.370 + "palignr $0x4,%%xmm3,%%xmm3 \n" 1.371 + "pshufb %%xmm4,%%xmm3 \n" 1.372 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.373 + "por %%xmm5,%%xmm3 \n" 1.374 + "sub $0x10,%2 \n" 1.375 + "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" 1.376 + "lea " MEMLEA(0x40,1) ",%1 \n" 1.377 + "jg 1b \n" 1.378 + : "+r"(src_raw), // %0 1.379 + "+r"(dst_argb), // %1 1.380 + "+r"(pix) // %2 1.381 + : "m"(kShuffleMaskRAWToARGB) // %3 1.382 + : "memory", "cc" 1.383 +#if defined(__SSE2__) 1.384 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.385 
+#endif 1.386 + ); 1.387 +} 1.388 + 1.389 +void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 1.390 + asm volatile ( 1.391 + "mov $0x1080108,%%eax \n" 1.392 + "movd %%eax,%%xmm5 \n" 1.393 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.394 + "mov $0x20802080,%%eax \n" 1.395 + "movd %%eax,%%xmm6 \n" 1.396 + "pshufd $0x0,%%xmm6,%%xmm6 \n" 1.397 + "pcmpeqb %%xmm3,%%xmm3 \n" 1.398 + "psllw $0xb,%%xmm3 \n" 1.399 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.400 + "psllw $0xa,%%xmm4 \n" 1.401 + "psrlw $0x5,%%xmm4 \n" 1.402 + "pcmpeqb %%xmm7,%%xmm7 \n" 1.403 + "psllw $0x8,%%xmm7 \n" 1.404 + "sub %0,%1 \n" 1.405 + "sub %0,%1 \n" 1.406 + LABELALIGN 1.407 + "1: \n" 1.408 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.409 + "movdqa %%xmm0,%%xmm1 \n" 1.410 + "movdqa %%xmm0,%%xmm2 \n" 1.411 + "pand %%xmm3,%%xmm1 \n" 1.412 + "psllw $0xb,%%xmm2 \n" 1.413 + "pmulhuw %%xmm5,%%xmm1 \n" 1.414 + "pmulhuw %%xmm5,%%xmm2 \n" 1.415 + "psllw $0x8,%%xmm1 \n" 1.416 + "por %%xmm2,%%xmm1 \n" 1.417 + "pand %%xmm4,%%xmm0 \n" 1.418 + "pmulhuw %%xmm6,%%xmm0 \n" 1.419 + "por %%xmm7,%%xmm0 \n" 1.420 + "movdqa %%xmm1,%%xmm2 \n" 1.421 + "punpcklbw %%xmm0,%%xmm1 \n" 1.422 + "punpckhbw %%xmm0,%%xmm2 \n" 1.423 + BUNDLEALIGN 1.424 + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) 1.425 + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) 1.426 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.427 + "sub $0x8,%2 \n" 1.428 + "jg 1b \n" 1.429 + : "+r"(src), // %0 1.430 + "+r"(dst), // %1 1.431 + "+r"(pix) // %2 1.432 + : 1.433 + : "memory", "cc", "eax" 1.434 +#if defined(__native_client__) && defined(__x86_64__) 1.435 + , "r14" 1.436 +#endif 1.437 +#if defined(__SSE2__) 1.438 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.439 +#endif 1.440 + ); 1.441 +} 1.442 + 1.443 +void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 1.444 + asm volatile ( 1.445 + "mov $0x1080108,%%eax \n" 1.446 + "movd %%eax,%%xmm5 \n" 1.447 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.448 + "mov $0x42004200,%%eax \n" 
1.449 + "movd %%eax,%%xmm6 \n" 1.450 + "pshufd $0x0,%%xmm6,%%xmm6 \n" 1.451 + "pcmpeqb %%xmm3,%%xmm3 \n" 1.452 + "psllw $0xb,%%xmm3 \n" 1.453 + "movdqa %%xmm3,%%xmm4 \n" 1.454 + "psrlw $0x6,%%xmm4 \n" 1.455 + "pcmpeqb %%xmm7,%%xmm7 \n" 1.456 + "psllw $0x8,%%xmm7 \n" 1.457 + "sub %0,%1 \n" 1.458 + "sub %0,%1 \n" 1.459 + LABELALIGN 1.460 + "1: \n" 1.461 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.462 + "movdqa %%xmm0,%%xmm1 \n" 1.463 + "movdqa %%xmm0,%%xmm2 \n" 1.464 + "psllw $0x1,%%xmm1 \n" 1.465 + "psllw $0xb,%%xmm2 \n" 1.466 + "pand %%xmm3,%%xmm1 \n" 1.467 + "pmulhuw %%xmm5,%%xmm2 \n" 1.468 + "pmulhuw %%xmm5,%%xmm1 \n" 1.469 + "psllw $0x8,%%xmm1 \n" 1.470 + "por %%xmm2,%%xmm1 \n" 1.471 + "movdqa %%xmm0,%%xmm2 \n" 1.472 + "pand %%xmm4,%%xmm0 \n" 1.473 + "psraw $0x8,%%xmm2 \n" 1.474 + "pmulhuw %%xmm6,%%xmm0 \n" 1.475 + "pand %%xmm7,%%xmm2 \n" 1.476 + "por %%xmm2,%%xmm0 \n" 1.477 + "movdqa %%xmm1,%%xmm2 \n" 1.478 + "punpcklbw %%xmm0,%%xmm1 \n" 1.479 + "punpckhbw %%xmm0,%%xmm2 \n" 1.480 + BUNDLEALIGN 1.481 + MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) 1.482 + MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) 1.483 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.484 + "sub $0x8,%2 \n" 1.485 + "jg 1b \n" 1.486 + : "+r"(src), // %0 1.487 + "+r"(dst), // %1 1.488 + "+r"(pix) // %2 1.489 + : 1.490 + : "memory", "cc", "eax" 1.491 +#if defined(__native_client__) && defined(__x86_64__) 1.492 + , "r14" 1.493 +#endif 1.494 +#if defined(__SSE2__) 1.495 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.496 +#endif 1.497 + ); 1.498 +} 1.499 + 1.500 +void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { 1.501 + asm volatile ( 1.502 + "mov $0xf0f0f0f,%%eax \n" 1.503 + "movd %%eax,%%xmm4 \n" 1.504 + "pshufd $0x0,%%xmm4,%%xmm4 \n" 1.505 + "movdqa %%xmm4,%%xmm5 \n" 1.506 + "pslld $0x4,%%xmm5 \n" 1.507 + "sub %0,%1 \n" 1.508 + "sub %0,%1 \n" 1.509 + LABELALIGN 1.510 + "1: \n" 1.511 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.512 + "movdqa 
%%xmm0,%%xmm2 \n" 1.513 + "pand %%xmm4,%%xmm0 \n" 1.514 + "pand %%xmm5,%%xmm2 \n" 1.515 + "movdqa %%xmm0,%%xmm1 \n" 1.516 + "movdqa %%xmm2,%%xmm3 \n" 1.517 + "psllw $0x4,%%xmm1 \n" 1.518 + "psrlw $0x4,%%xmm3 \n" 1.519 + "por %%xmm1,%%xmm0 \n" 1.520 + "por %%xmm3,%%xmm2 \n" 1.521 + "movdqa %%xmm0,%%xmm1 \n" 1.522 + "punpcklbw %%xmm2,%%xmm0 \n" 1.523 + "punpckhbw %%xmm2,%%xmm1 \n" 1.524 + BUNDLEALIGN 1.525 + MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) 1.526 + MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) 1.527 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.528 + "sub $0x8,%2 \n" 1.529 + "jg 1b \n" 1.530 + : "+r"(src), // %0 1.531 + "+r"(dst), // %1 1.532 + "+r"(pix) // %2 1.533 + : 1.534 + : "memory", "cc", "eax" 1.535 +#if defined(__native_client__) && defined(__x86_64__) 1.536 + , "r14" 1.537 +#endif 1.538 +#if defined(__SSE2__) 1.539 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.540 +#endif 1.541 + ); 1.542 +} 1.543 + 1.544 +void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { 1.545 + asm volatile ( 1.546 + "movdqa %3,%%xmm6 \n" 1.547 + LABELALIGN 1.548 + "1: \n" 1.549 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.550 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.551 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.552 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.553 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.554 + "pshufb %%xmm6,%%xmm0 \n" 1.555 + "pshufb %%xmm6,%%xmm1 \n" 1.556 + "pshufb %%xmm6,%%xmm2 \n" 1.557 + "pshufb %%xmm6,%%xmm3 \n" 1.558 + "movdqa %%xmm1,%%xmm4 \n" 1.559 + "psrldq $0x4,%%xmm1 \n" 1.560 + "pslldq $0xc,%%xmm4 \n" 1.561 + "movdqa %%xmm2,%%xmm5 \n" 1.562 + "por %%xmm4,%%xmm0 \n" 1.563 + "pslldq $0x8,%%xmm5 \n" 1.564 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.565 + "por %%xmm5,%%xmm1 \n" 1.566 + "psrldq $0x8,%%xmm2 \n" 1.567 + "pslldq $0x4,%%xmm3 \n" 1.568 + "por %%xmm3,%%xmm2 \n" 1.569 + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 1.570 + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" 1.571 + "lea " MEMLEA(0x30,1) ",%1 \n" 
1.572 + "sub $0x10,%2 \n" 1.573 + "jg 1b \n" 1.574 + : "+r"(src), // %0 1.575 + "+r"(dst), // %1 1.576 + "+r"(pix) // %2 1.577 + : "m"(kShuffleMaskARGBToRGB24) // %3 1.578 + : "memory", "cc" 1.579 +#if defined(__SSE2__) 1.580 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.581 +#endif 1.582 + ); 1.583 +} 1.584 + 1.585 +void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { 1.586 + asm volatile ( 1.587 + "movdqa %3,%%xmm6 \n" 1.588 + LABELALIGN 1.589 + "1: \n" 1.590 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.591 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.592 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.593 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.594 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.595 + "pshufb %%xmm6,%%xmm0 \n" 1.596 + "pshufb %%xmm6,%%xmm1 \n" 1.597 + "pshufb %%xmm6,%%xmm2 \n" 1.598 + "pshufb %%xmm6,%%xmm3 \n" 1.599 + "movdqa %%xmm1,%%xmm4 \n" 1.600 + "psrldq $0x4,%%xmm1 \n" 1.601 + "pslldq $0xc,%%xmm4 \n" 1.602 + "movdqa %%xmm2,%%xmm5 \n" 1.603 + "por %%xmm4,%%xmm0 \n" 1.604 + "pslldq $0x8,%%xmm5 \n" 1.605 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.606 + "por %%xmm5,%%xmm1 \n" 1.607 + "psrldq $0x8,%%xmm2 \n" 1.608 + "pslldq $0x4,%%xmm3 \n" 1.609 + "por %%xmm3,%%xmm2 \n" 1.610 + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 1.611 + "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" 1.612 + "lea " MEMLEA(0x30,1) ",%1 \n" 1.613 + "sub $0x10,%2 \n" 1.614 + "jg 1b \n" 1.615 + : "+r"(src), // %0 1.616 + "+r"(dst), // %1 1.617 + "+r"(pix) // %2 1.618 + : "m"(kShuffleMaskARGBToRAW) // %3 1.619 + : "memory", "cc" 1.620 +#if defined(__SSE2__) 1.621 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.622 +#endif 1.623 + ); 1.624 +} 1.625 + 1.626 +void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { 1.627 + asm volatile ( 1.628 + "pcmpeqb %%xmm3,%%xmm3 \n" 1.629 + "psrld $0x1b,%%xmm3 \n" 1.630 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.631 + "psrld $0x1a,%%xmm4 \n" 1.632 + "pslld $0x5,%%xmm4 \n" 1.633 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.634 + "pslld 
$0xb,%%xmm5 \n" 1.635 + LABELALIGN 1.636 + "1: \n" 1.637 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.638 + "movdqa %%xmm0,%%xmm1 \n" 1.639 + "movdqa %%xmm0,%%xmm2 \n" 1.640 + "pslld $0x8,%%xmm0 \n" 1.641 + "psrld $0x3,%%xmm1 \n" 1.642 + "psrld $0x5,%%xmm2 \n" 1.643 + "psrad $0x10,%%xmm0 \n" 1.644 + "pand %%xmm3,%%xmm1 \n" 1.645 + "pand %%xmm4,%%xmm2 \n" 1.646 + "pand %%xmm5,%%xmm0 \n" 1.647 + "por %%xmm2,%%xmm1 \n" 1.648 + "por %%xmm1,%%xmm0 \n" 1.649 + "packssdw %%xmm0,%%xmm0 \n" 1.650 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.651 + "movq %%xmm0," MEMACCESS(1) " \n" 1.652 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.653 + "sub $0x4,%2 \n" 1.654 + "jg 1b \n" 1.655 + : "+r"(src), // %0 1.656 + "+r"(dst), // %1 1.657 + "+r"(pix) // %2 1.658 + : 1.659 + : "memory", "cc" 1.660 +#if defined(__SSE2__) 1.661 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.662 +#endif 1.663 + ); 1.664 +} 1.665 + 1.666 +void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { 1.667 + asm volatile ( 1.668 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.669 + "psrld $0x1b,%%xmm4 \n" 1.670 + "movdqa %%xmm4,%%xmm5 \n" 1.671 + "pslld $0x5,%%xmm5 \n" 1.672 + "movdqa %%xmm4,%%xmm6 \n" 1.673 + "pslld $0xa,%%xmm6 \n" 1.674 + "pcmpeqb %%xmm7,%%xmm7 \n" 1.675 + "pslld $0xf,%%xmm7 \n" 1.676 + LABELALIGN 1.677 + "1: \n" 1.678 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.679 + "movdqa %%xmm0,%%xmm1 \n" 1.680 + "movdqa %%xmm0,%%xmm2 \n" 1.681 + "movdqa %%xmm0,%%xmm3 \n" 1.682 + "psrad $0x10,%%xmm0 \n" 1.683 + "psrld $0x3,%%xmm1 \n" 1.684 + "psrld $0x6,%%xmm2 \n" 1.685 + "psrld $0x9,%%xmm3 \n" 1.686 + "pand %%xmm7,%%xmm0 \n" 1.687 + "pand %%xmm4,%%xmm1 \n" 1.688 + "pand %%xmm5,%%xmm2 \n" 1.689 + "pand %%xmm6,%%xmm3 \n" 1.690 + "por %%xmm1,%%xmm0 \n" 1.691 + "por %%xmm3,%%xmm2 \n" 1.692 + "por %%xmm2,%%xmm0 \n" 1.693 + "packssdw %%xmm0,%%xmm0 \n" 1.694 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.695 + "movq %%xmm0," MEMACCESS(1) " \n" 1.696 + "lea " MEMACCESS2(0x8,1) ",%1 \n" 1.697 + "sub $0x4,%2 \n" 1.698 + "jg 1b \n" 1.699 + : "+r"(src), // 
%0 1.700 + "+r"(dst), // %1 1.701 + "+r"(pix) // %2 1.702 + : 1.703 + : "memory", "cc" 1.704 +#if defined(__SSE2__) 1.705 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.706 +#endif 1.707 + ); 1.708 +} 1.709 + 1.710 +void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { 1.711 + asm volatile ( 1.712 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.713 + "psllw $0xc,%%xmm4 \n" 1.714 + "movdqa %%xmm4,%%xmm3 \n" 1.715 + "psrlw $0x8,%%xmm3 \n" 1.716 + LABELALIGN 1.717 + "1: \n" 1.718 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.719 + "movdqa %%xmm0,%%xmm1 \n" 1.720 + "pand %%xmm3,%%xmm0 \n" 1.721 + "pand %%xmm4,%%xmm1 \n" 1.722 + "psrlq $0x4,%%xmm0 \n" 1.723 + "psrlq $0x8,%%xmm1 \n" 1.724 + "por %%xmm1,%%xmm0 \n" 1.725 + "packuswb %%xmm0,%%xmm0 \n" 1.726 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.727 + "movq %%xmm0," MEMACCESS(1) " \n" 1.728 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.729 + "sub $0x4,%2 \n" 1.730 + "jg 1b \n" 1.731 + : "+r"(src), // %0 1.732 + "+r"(dst), // %1 1.733 + "+r"(pix) // %2 1.734 + : 1.735 + : "memory", "cc" 1.736 +#if defined(__SSE2__) 1.737 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 1.738 +#endif 1.739 + ); 1.740 +} 1.741 +#endif // HAS_RGB24TOARGBROW_SSSE3 1.742 + 1.743 +#ifdef HAS_ARGBTOYROW_SSSE3 1.744 +void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1.745 + asm volatile ( 1.746 + "movdqa %4,%%xmm5 \n" 1.747 + "movdqa %3,%%xmm4 \n" 1.748 + LABELALIGN 1.749 + "1: \n" 1.750 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.751 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.752 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.753 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.754 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.755 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.756 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.757 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.758 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.759 + "phaddw %%xmm1,%%xmm0 \n" 1.760 + "phaddw %%xmm3,%%xmm2 \n" 1.761 + "psrlw $0x7,%%xmm0 \n" 1.762 + "psrlw $0x7,%%xmm2 \n" 1.763 + "packuswb %%xmm2,%%xmm0 \n" 1.764 + "paddb 
%%xmm5,%%xmm0 \n" 1.765 + "sub $0x10,%2 \n" 1.766 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.767 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.768 + "jg 1b \n" 1.769 + : "+r"(src_argb), // %0 1.770 + "+r"(dst_y), // %1 1.771 + "+r"(pix) // %2 1.772 + : "m"(kARGBToY), // %3 1.773 + "m"(kAddY16) // %4 1.774 + : "memory", "cc" 1.775 +#if defined(__SSE2__) 1.776 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.777 +#endif 1.778 + ); 1.779 +} 1.780 + 1.781 +void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1.782 + asm volatile ( 1.783 + "movdqa %4,%%xmm5 \n" 1.784 + "movdqa %3,%%xmm4 \n" 1.785 + LABELALIGN 1.786 + "1: \n" 1.787 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.788 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.789 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.790 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.791 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.792 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.793 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.794 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.795 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.796 + "phaddw %%xmm1,%%xmm0 \n" 1.797 + "phaddw %%xmm3,%%xmm2 \n" 1.798 + "psrlw $0x7,%%xmm0 \n" 1.799 + "psrlw $0x7,%%xmm2 \n" 1.800 + "packuswb %%xmm2,%%xmm0 \n" 1.801 + "paddb %%xmm5,%%xmm0 \n" 1.802 + "sub $0x10,%2 \n" 1.803 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.804 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.805 + "jg 1b \n" 1.806 + : "+r"(src_argb), // %0 1.807 + "+r"(dst_y), // %1 1.808 + "+r"(pix) // %2 1.809 + : "m"(kARGBToY), // %3 1.810 + "m"(kAddY16) // %4 1.811 + : "memory", "cc" 1.812 +#if defined(__SSE2__) 1.813 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.814 +#endif 1.815 + ); 1.816 +} 1.817 +#endif // HAS_ARGBTOYROW_SSSE3 1.818 + 1.819 +#ifdef HAS_ARGBTOYJROW_SSSE3 1.820 +void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1.821 + asm volatile ( 1.822 + "movdqa %3,%%xmm4 \n" 1.823 + "movdqa %4,%%xmm5 \n" 1.824 + LABELALIGN 1.825 + "1: \n" 1.826 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.827 + "movdqa " MEMACCESS2(0x10,0) 
",%%xmm1 \n" 1.828 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.829 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.830 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.831 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.832 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.833 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.834 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.835 + "phaddw %%xmm1,%%xmm0 \n" 1.836 + "phaddw %%xmm3,%%xmm2 \n" 1.837 + "paddw %%xmm5,%%xmm0 \n" 1.838 + "paddw %%xmm5,%%xmm2 \n" 1.839 + "psrlw $0x7,%%xmm0 \n" 1.840 + "psrlw $0x7,%%xmm2 \n" 1.841 + "packuswb %%xmm2,%%xmm0 \n" 1.842 + "sub $0x10,%2 \n" 1.843 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.844 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.845 + "jg 1b \n" 1.846 + : "+r"(src_argb), // %0 1.847 + "+r"(dst_y), // %1 1.848 + "+r"(pix) // %2 1.849 + : "m"(kARGBToYJ), // %3 1.850 + "m"(kAddYJ64) // %4 1.851 + : "memory", "cc" 1.852 +#if defined(__SSE2__) 1.853 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.854 +#endif 1.855 + ); 1.856 +} 1.857 + 1.858 +void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1.859 + asm volatile ( 1.860 + "movdqa %3,%%xmm4 \n" 1.861 + "movdqa %4,%%xmm5 \n" 1.862 + LABELALIGN 1.863 + "1: \n" 1.864 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.865 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.866 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.867 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.868 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.869 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.870 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.871 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.872 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.873 + "phaddw %%xmm1,%%xmm0 \n" 1.874 + "phaddw %%xmm3,%%xmm2 \n" 1.875 + "paddw %%xmm5,%%xmm0 \n" 1.876 + "paddw %%xmm5,%%xmm2 \n" 1.877 + "psrlw $0x7,%%xmm0 \n" 1.878 + "psrlw $0x7,%%xmm2 \n" 1.879 + "packuswb %%xmm2,%%xmm0 \n" 1.880 + "sub $0x10,%2 \n" 1.881 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.882 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.883 + "jg 1b \n" 1.884 + : "+r"(src_argb), // %0 1.885 + "+r"(dst_y), // %1 1.886 + "+r"(pix) // %2 1.887 + : 
"m"(kARGBToYJ), // %3 1.888 + "m"(kAddYJ64) // %4 1.889 + : "memory", "cc" 1.890 +#if defined(__SSE2__) 1.891 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.892 +#endif 1.893 + ); 1.894 +} 1.895 +#endif // HAS_ARGBTOYJROW_SSSE3 1.896 + 1.897 +#ifdef HAS_ARGBTOUVROW_SSSE3 1.898 +// TODO(fbarchard): pass xmm constants to single block of assembly. 1.899 +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes 1.900 +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, 1.901 +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around 1.902 +// and considered unsafe. 1.903 +void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1.904 + uint8* dst_u, uint8* dst_v, int width) { 1.905 + asm volatile ( 1.906 + "movdqa %0,%%xmm4 \n" 1.907 + "movdqa %1,%%xmm3 \n" 1.908 + "movdqa %2,%%xmm5 \n" 1.909 + : 1.910 + : "m"(kARGBToU), // %0 1.911 + "m"(kARGBToV), // %1 1.912 + "m"(kAddUV128) // %2 1.913 + ); 1.914 + asm volatile ( 1.915 + "sub %1,%2 \n" 1.916 + LABELALIGN 1.917 + "1: \n" 1.918 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.919 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.920 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.921 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.922 + BUNDLEALIGN 1.923 + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 1.924 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 1.925 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 1.926 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 1.927 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.928 + "movdqa %%xmm0,%%xmm7 \n" 1.929 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.930 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.931 + "pavgb %%xmm7,%%xmm0 \n" 1.932 + "movdqa %%xmm2,%%xmm7 \n" 1.933 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.934 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.935 + "pavgb %%xmm7,%%xmm2 \n" 1.936 + "movdqa %%xmm0,%%xmm1 \n" 1.937 + "movdqa %%xmm2,%%xmm6 \n" 1.938 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.939 
+ "pmaddubsw %%xmm4,%%xmm2 \n" 1.940 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.941 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.942 + "phaddw %%xmm2,%%xmm0 \n" 1.943 + "phaddw %%xmm6,%%xmm1 \n" 1.944 + "psraw $0x8,%%xmm0 \n" 1.945 + "psraw $0x8,%%xmm1 \n" 1.946 + "packsswb %%xmm1,%%xmm0 \n" 1.947 + "paddb %%xmm5,%%xmm0 \n" 1.948 + "sub $0x10,%3 \n" 1.949 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.950 + BUNDLEALIGN 1.951 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.952 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.953 + "jg 1b \n" 1.954 + : "+r"(src_argb0), // %0 1.955 + "+r"(dst_u), // %1 1.956 + "+r"(dst_v), // %2 1.957 + "+rm"(width) // %3 1.958 + : "r"((intptr_t)(src_stride_argb)) // %4 1.959 + : "memory", "cc" 1.960 +#if defined(__native_client__) && defined(__x86_64__) 1.961 + , "r14" 1.962 +#endif 1.963 +#if defined(__SSE2__) 1.964 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.965 +#endif 1.966 + ); 1.967 +} 1.968 + 1.969 +// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. 1.970 +void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1.971 + uint8* dst_u, uint8* dst_v, int width) { 1.972 + asm volatile ( 1.973 + "movdqa %0,%%xmm4 \n" 1.974 + "movdqa %1,%%xmm3 \n" 1.975 + "movdqa %2,%%xmm5 \n" 1.976 + : 1.977 + : "m"(kARGBToUJ), // %0 1.978 + "m"(kARGBToVJ), // %1 1.979 + "m"(kAddUVJ128) // %2 1.980 + ); 1.981 + asm volatile ( 1.982 + "sub %1,%2 \n" 1.983 + LABELALIGN 1.984 + "1: \n" 1.985 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.986 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.987 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.988 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.989 + BUNDLEALIGN 1.990 + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 1.991 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 1.992 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 1.993 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 1.994 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.995 + "movdqa %%xmm0,%%xmm7 \n" 1.996 + "shufps 
$0x88,%%xmm1,%%xmm0 \n" 1.997 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.998 + "pavgb %%xmm7,%%xmm0 \n" 1.999 + "movdqa %%xmm2,%%xmm7 \n" 1.1000 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1001 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1002 + "pavgb %%xmm7,%%xmm2 \n" 1.1003 + "movdqa %%xmm0,%%xmm1 \n" 1.1004 + "movdqa %%xmm2,%%xmm6 \n" 1.1005 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1006 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1007 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1008 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1009 + "phaddw %%xmm2,%%xmm0 \n" 1.1010 + "phaddw %%xmm6,%%xmm1 \n" 1.1011 + "paddw %%xmm5,%%xmm0 \n" 1.1012 + "paddw %%xmm5,%%xmm1 \n" 1.1013 + "psraw $0x8,%%xmm0 \n" 1.1014 + "psraw $0x8,%%xmm1 \n" 1.1015 + "packsswb %%xmm1,%%xmm0 \n" 1.1016 + "sub $0x10,%3 \n" 1.1017 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1018 + BUNDLEALIGN 1.1019 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1020 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1021 + "jg 1b \n" 1.1022 + : "+r"(src_argb0), // %0 1.1023 + "+r"(dst_u), // %1 1.1024 + "+r"(dst_v), // %2 1.1025 + "+rm"(width) // %3 1.1026 + : "r"((intptr_t)(src_stride_argb)) // %4 1.1027 + : "memory", "cc" 1.1028 +#if defined(__native_client__) && defined(__x86_64__) 1.1029 + , "r14" 1.1030 +#endif 1.1031 +#if defined(__SSE2__) 1.1032 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1033 +#endif 1.1034 + ); 1.1035 +} 1.1036 + 1.1037 +void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 1.1038 + uint8* dst_u, uint8* dst_v, int width) { 1.1039 + asm volatile ( 1.1040 + "movdqa %0,%%xmm4 \n" 1.1041 + "movdqa %1,%%xmm3 \n" 1.1042 + "movdqa %2,%%xmm5 \n" 1.1043 + : 1.1044 + : "m"(kARGBToU), // %0 1.1045 + "m"(kARGBToV), // %1 1.1046 + "m"(kAddUV128) // %2 1.1047 + ); 1.1048 + asm volatile ( 1.1049 + "sub %1,%2 \n" 1.1050 + LABELALIGN 1.1051 + "1: \n" 1.1052 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1053 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1054 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1055 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 
1.1056 + BUNDLEALIGN 1.1057 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1.1058 + "pavgb %%xmm7,%%xmm0 \n" 1.1059 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1.1060 + "pavgb %%xmm7,%%xmm1 \n" 1.1061 + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1.1062 + "pavgb %%xmm7,%%xmm2 \n" 1.1063 + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1.1064 + "pavgb %%xmm7,%%xmm6 \n" 1.1065 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1066 + "movdqa %%xmm0,%%xmm7 \n" 1.1067 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1068 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1069 + "pavgb %%xmm7,%%xmm0 \n" 1.1070 + "movdqa %%xmm2,%%xmm7 \n" 1.1071 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1072 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1073 + "pavgb %%xmm7,%%xmm2 \n" 1.1074 + "movdqa %%xmm0,%%xmm1 \n" 1.1075 + "movdqa %%xmm2,%%xmm6 \n" 1.1076 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1077 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1078 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1079 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1080 + "phaddw %%xmm2,%%xmm0 \n" 1.1081 + "phaddw %%xmm6,%%xmm1 \n" 1.1082 + "psraw $0x8,%%xmm0 \n" 1.1083 + "psraw $0x8,%%xmm1 \n" 1.1084 + "packsswb %%xmm1,%%xmm0 \n" 1.1085 + "paddb %%xmm5,%%xmm0 \n" 1.1086 + "sub $0x10,%3 \n" 1.1087 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1088 + BUNDLEALIGN 1.1089 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1090 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1091 + "jg 1b \n" 1.1092 + : "+r"(src_argb0), // %0 1.1093 + "+r"(dst_u), // %1 1.1094 + "+r"(dst_v), // %2 1.1095 + "+rm"(width) // %3 1.1096 + : "r"((intptr_t)(src_stride_argb)) // %4 1.1097 + : "memory", "cc" 1.1098 +#if defined(__native_client__) && defined(__x86_64__) 1.1099 + , "r14" 1.1100 +#endif 1.1101 +#if defined(__SSE2__) 1.1102 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1103 +#endif 1.1104 + ); 1.1105 +} 1.1106 + 1.1107 +void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 1.1108 + uint8* dst_u, uint8* dst_v, int width) { 1.1109 + asm 
volatile ( 1.1110 + "movdqa %0,%%xmm4 \n" 1.1111 + "movdqa %1,%%xmm3 \n" 1.1112 + "movdqa %2,%%xmm5 \n" 1.1113 + : 1.1114 + : "m"(kARGBToUJ), // %0 1.1115 + "m"(kARGBToVJ), // %1 1.1116 + "m"(kAddUVJ128) // %2 1.1117 + ); 1.1118 + asm volatile ( 1.1119 + "sub %1,%2 \n" 1.1120 + LABELALIGN 1.1121 + "1: \n" 1.1122 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1123 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1124 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1125 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1126 + BUNDLEALIGN 1.1127 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1.1128 + "pavgb %%xmm7,%%xmm0 \n" 1.1129 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1.1130 + "pavgb %%xmm7,%%xmm1 \n" 1.1131 + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1.1132 + "pavgb %%xmm7,%%xmm2 \n" 1.1133 + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1.1134 + "pavgb %%xmm7,%%xmm6 \n" 1.1135 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1136 + "movdqa %%xmm0,%%xmm7 \n" 1.1137 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1138 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1139 + "pavgb %%xmm7,%%xmm0 \n" 1.1140 + "movdqa %%xmm2,%%xmm7 \n" 1.1141 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1142 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1143 + "pavgb %%xmm7,%%xmm2 \n" 1.1144 + "movdqa %%xmm0,%%xmm1 \n" 1.1145 + "movdqa %%xmm2,%%xmm6 \n" 1.1146 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1147 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1148 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1149 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1150 + "phaddw %%xmm2,%%xmm0 \n" 1.1151 + "phaddw %%xmm6,%%xmm1 \n" 1.1152 + "paddw %%xmm5,%%xmm0 \n" 1.1153 + "paddw %%xmm5,%%xmm1 \n" 1.1154 + "psraw $0x8,%%xmm0 \n" 1.1155 + "psraw $0x8,%%xmm1 \n" 1.1156 + "packsswb %%xmm1,%%xmm0 \n" 1.1157 + "sub $0x10,%3 \n" 1.1158 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1159 + BUNDLEALIGN 1.1160 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1161 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1162 + "jg 1b \n" 1.1163 + : 
"+r"(src_argb0), // %0 1.1164 + "+r"(dst_u), // %1 1.1165 + "+r"(dst_v), // %2 1.1166 + "+rm"(width) // %3 1.1167 + : "r"((intptr_t)(src_stride_argb)) 1.1168 + : "memory", "cc" 1.1169 +#if defined(__native_client__) && defined(__x86_64__) 1.1170 + , "r14" 1.1171 +#endif 1.1172 +#if defined(__SSE2__) 1.1173 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1174 +#endif 1.1175 + ); 1.1176 +} 1.1177 + 1.1178 +void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, 1.1179 + int width) { 1.1180 + asm volatile ( 1.1181 + "movdqa %0,%%xmm4 \n" 1.1182 + "movdqa %1,%%xmm3 \n" 1.1183 + "movdqa %2,%%xmm5 \n" 1.1184 + : 1.1185 + : "m"(kARGBToU), // %0 1.1186 + "m"(kARGBToV), // %1 1.1187 + "m"(kAddUV128) // %2 1.1188 + ); 1.1189 + asm volatile ( 1.1190 + "sub %1,%2 \n" 1.1191 + LABELALIGN 1.1192 + "1: \n" 1.1193 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1194 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1195 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1196 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1197 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1198 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1199 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1200 + "pmaddubsw %%xmm4,%%xmm6 \n" 1.1201 + "phaddw %%xmm1,%%xmm0 \n" 1.1202 + "phaddw %%xmm6,%%xmm2 \n" 1.1203 + "psraw $0x8,%%xmm0 \n" 1.1204 + "psraw $0x8,%%xmm2 \n" 1.1205 + "packsswb %%xmm2,%%xmm0 \n" 1.1206 + "paddb %%xmm5,%%xmm0 \n" 1.1207 + "sub $0x10,%3 \n" 1.1208 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.1209 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1210 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1211 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1212 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1213 + "pmaddubsw %%xmm3,%%xmm0 \n" 1.1214 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1215 + "pmaddubsw %%xmm3,%%xmm2 \n" 1.1216 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1217 + "phaddw %%xmm1,%%xmm0 \n" 1.1218 + "phaddw %%xmm6,%%xmm2 \n" 1.1219 + "psraw $0x8,%%xmm0 \n" 1.1220 + "psraw $0x8,%%xmm2 \n" 1.1221 + "packsswb %%xmm2,%%xmm0 \n" 1.1222 + "paddb %%xmm5,%%xmm0 
\n" 1.1223 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1224 + BUNDLEALIGN 1.1225 + MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) 1.1226 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1227 + "jg 1b \n" 1.1228 + : "+r"(src_argb), // %0 1.1229 + "+r"(dst_u), // %1 1.1230 + "+r"(dst_v), // %2 1.1231 + "+rm"(width) // %3 1.1232 + : 1.1233 + : "memory", "cc" 1.1234 +#if defined(__native_client__) && defined(__x86_64__) 1.1235 + , "r14" 1.1236 +#endif 1.1237 +#if defined(__SSE2__) 1.1238 + , "xmm0", "xmm1", "xmm2", "xmm6" 1.1239 +#endif 1.1240 + ); 1.1241 +} 1.1242 + 1.1243 +void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, 1.1244 + uint8* dst_v, int width) { 1.1245 + asm volatile ( 1.1246 + "movdqa %0,%%xmm4 \n" 1.1247 + "movdqa %1,%%xmm3 \n" 1.1248 + "movdqa %2,%%xmm5 \n" 1.1249 + : 1.1250 + : "m"(kARGBToU), // %0 1.1251 + "m"(kARGBToV), // %1 1.1252 + "m"(kAddUV128) // %2 1.1253 + ); 1.1254 + asm volatile ( 1.1255 + "sub %1,%2 \n" 1.1256 + LABELALIGN 1.1257 + "1: \n" 1.1258 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1259 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1260 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1261 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1262 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1263 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1264 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1265 + "pmaddubsw %%xmm4,%%xmm6 \n" 1.1266 + "phaddw %%xmm1,%%xmm0 \n" 1.1267 + "phaddw %%xmm6,%%xmm2 \n" 1.1268 + "psraw $0x8,%%xmm0 \n" 1.1269 + "psraw $0x8,%%xmm2 \n" 1.1270 + "packsswb %%xmm2,%%xmm0 \n" 1.1271 + "paddb %%xmm5,%%xmm0 \n" 1.1272 + "sub $0x10,%3 \n" 1.1273 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.1274 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1275 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1276 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1277 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1278 + "pmaddubsw %%xmm3,%%xmm0 \n" 1.1279 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1280 + "pmaddubsw %%xmm3,%%xmm2 \n" 1.1281 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1282 + "phaddw 
%%xmm1,%%xmm0 \n" 1.1283 + "phaddw %%xmm6,%%xmm2 \n" 1.1284 + "psraw $0x8,%%xmm0 \n" 1.1285 + "psraw $0x8,%%xmm2 \n" 1.1286 + "packsswb %%xmm2,%%xmm0 \n" 1.1287 + "paddb %%xmm5,%%xmm0 \n" 1.1288 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1289 + BUNDLEALIGN 1.1290 + MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) 1.1291 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1292 + "jg 1b \n" 1.1293 + : "+r"(src_argb), // %0 1.1294 + "+r"(dst_u), // %1 1.1295 + "+r"(dst_v), // %2 1.1296 + "+rm"(width) // %3 1.1297 + : 1.1298 + : "memory", "cc" 1.1299 +#if defined(__native_client__) && defined(__x86_64__) 1.1300 + , "r14" 1.1301 +#endif 1.1302 +#if defined(__SSE2__) 1.1303 + , "xmm0", "xmm1", "xmm2", "xmm6" 1.1304 +#endif 1.1305 + ); 1.1306 +} 1.1307 + 1.1308 +void ARGBToUV422Row_SSSE3(const uint8* src_argb0, 1.1309 + uint8* dst_u, uint8* dst_v, int width) { 1.1310 + asm volatile ( 1.1311 + "movdqa %0,%%xmm4 \n" 1.1312 + "movdqa %1,%%xmm3 \n" 1.1313 + "movdqa %2,%%xmm5 \n" 1.1314 + : 1.1315 + : "m"(kARGBToU), // %0 1.1316 + "m"(kARGBToV), // %1 1.1317 + "m"(kAddUV128) // %2 1.1318 + ); 1.1319 + asm volatile ( 1.1320 + "sub %1,%2 \n" 1.1321 + LABELALIGN 1.1322 + "1: \n" 1.1323 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1324 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1325 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1326 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1327 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1328 + "movdqa %%xmm0,%%xmm7 \n" 1.1329 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1330 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1331 + "pavgb %%xmm7,%%xmm0 \n" 1.1332 + "movdqa %%xmm2,%%xmm7 \n" 1.1333 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1334 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1335 + "pavgb %%xmm7,%%xmm2 \n" 1.1336 + "movdqa %%xmm0,%%xmm1 \n" 1.1337 + "movdqa %%xmm2,%%xmm6 \n" 1.1338 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1339 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1340 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1341 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1342 + "phaddw %%xmm2,%%xmm0 \n" 1.1343 + "phaddw %%xmm6,%%xmm1 
\n" 1.1344 + "psraw $0x8,%%xmm0 \n" 1.1345 + "psraw $0x8,%%xmm1 \n" 1.1346 + "packsswb %%xmm1,%%xmm0 \n" 1.1347 + "paddb %%xmm5,%%xmm0 \n" 1.1348 + "sub $0x10,%3 \n" 1.1349 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1350 + BUNDLEALIGN 1.1351 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1352 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1353 + "jg 1b \n" 1.1354 + : "+r"(src_argb0), // %0 1.1355 + "+r"(dst_u), // %1 1.1356 + "+r"(dst_v), // %2 1.1357 + "+rm"(width) // %3 1.1358 + : 1.1359 + : "memory", "cc" 1.1360 +#if defined(__native_client__) && defined(__x86_64__) 1.1361 + , "r14" 1.1362 +#endif 1.1363 +#if defined(__SSE2__) 1.1364 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1365 +#endif 1.1366 + ); 1.1367 +} 1.1368 + 1.1369 +void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, 1.1370 + uint8* dst_u, uint8* dst_v, int width) { 1.1371 + asm volatile ( 1.1372 + "movdqa %0,%%xmm4 \n" 1.1373 + "movdqa %1,%%xmm3 \n" 1.1374 + "movdqa %2,%%xmm5 \n" 1.1375 + : 1.1376 + : "m"(kARGBToU), // %0 1.1377 + "m"(kARGBToV), // %1 1.1378 + "m"(kAddUV128) // %2 1.1379 + ); 1.1380 + asm volatile ( 1.1381 + "sub %1,%2 \n" 1.1382 + LABELALIGN 1.1383 + "1: \n" 1.1384 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1385 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1386 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1387 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1388 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1389 + "movdqa %%xmm0,%%xmm7 \n" 1.1390 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1391 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1392 + "pavgb %%xmm7,%%xmm0 \n" 1.1393 + "movdqa %%xmm2,%%xmm7 \n" 1.1394 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1395 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1396 + "pavgb %%xmm7,%%xmm2 \n" 1.1397 + "movdqa %%xmm0,%%xmm1 \n" 1.1398 + "movdqa %%xmm2,%%xmm6 \n" 1.1399 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1400 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1401 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1402 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1403 + "phaddw %%xmm2,%%xmm0 \n" 1.1404 + "phaddw %%xmm6,%%xmm1 
\n" 1.1405 + "psraw $0x8,%%xmm0 \n" 1.1406 + "psraw $0x8,%%xmm1 \n" 1.1407 + "packsswb %%xmm1,%%xmm0 \n" 1.1408 + "paddb %%xmm5,%%xmm0 \n" 1.1409 + "sub $0x10,%3 \n" 1.1410 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1411 + BUNDLEALIGN 1.1412 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1413 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1414 + "jg 1b \n" 1.1415 + : "+r"(src_argb0), // %0 1.1416 + "+r"(dst_u), // %1 1.1417 + "+r"(dst_v), // %2 1.1418 + "+rm"(width) // %3 1.1419 + : 1.1420 + : "memory", "cc" 1.1421 +#if defined(__native_client__) && defined(__x86_64__) 1.1422 + , "r14" 1.1423 +#endif 1.1424 +#if defined(__SSE2__) 1.1425 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1426 +#endif 1.1427 + ); 1.1428 +} 1.1429 + 1.1430 +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { 1.1431 + asm volatile ( 1.1432 + "movdqa %4,%%xmm5 \n" 1.1433 + "movdqa %3,%%xmm4 \n" 1.1434 + LABELALIGN 1.1435 + "1: \n" 1.1436 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1437 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1438 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1439 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.1440 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1441 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1442 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1443 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.1444 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1445 + "phaddw %%xmm1,%%xmm0 \n" 1.1446 + "phaddw %%xmm3,%%xmm2 \n" 1.1447 + "psrlw $0x7,%%xmm0 \n" 1.1448 + "psrlw $0x7,%%xmm2 \n" 1.1449 + "packuswb %%xmm2,%%xmm0 \n" 1.1450 + "paddb %%xmm5,%%xmm0 \n" 1.1451 + "sub $0x10,%2 \n" 1.1452 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.1453 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1454 + "jg 1b \n" 1.1455 + : "+r"(src_bgra), // %0 1.1456 + "+r"(dst_y), // %1 1.1457 + "+r"(pix) // %2 1.1458 + : "m"(kBGRAToY), // %3 1.1459 + "m"(kAddY16) // %4 1.1460 + : "memory", "cc" 1.1461 +#if defined(__SSE2__) 1.1462 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.1463 +#endif 1.1464 + ); 1.1465 +} 1.1466 + 1.1467 +void 
BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { 1.1468 + asm volatile ( 1.1469 + "movdqa %4,%%xmm5 \n" 1.1470 + "movdqa %3,%%xmm4 \n" 1.1471 + LABELALIGN 1.1472 + "1: \n" 1.1473 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1474 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1475 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1476 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.1477 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1478 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1479 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1480 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.1481 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1482 + "phaddw %%xmm1,%%xmm0 \n" 1.1483 + "phaddw %%xmm3,%%xmm2 \n" 1.1484 + "psrlw $0x7,%%xmm0 \n" 1.1485 + "psrlw $0x7,%%xmm2 \n" 1.1486 + "packuswb %%xmm2,%%xmm0 \n" 1.1487 + "paddb %%xmm5,%%xmm0 \n" 1.1488 + "sub $0x10,%2 \n" 1.1489 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.1490 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1491 + "jg 1b \n" 1.1492 + : "+r"(src_bgra), // %0 1.1493 + "+r"(dst_y), // %1 1.1494 + "+r"(pix) // %2 1.1495 + : "m"(kBGRAToY), // %3 1.1496 + "m"(kAddY16) // %4 1.1497 + : "memory", "cc" 1.1498 +#if defined(__SSE2__) 1.1499 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.1500 +#endif 1.1501 + ); 1.1502 +} 1.1503 + 1.1504 +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, 1.1505 + uint8* dst_u, uint8* dst_v, int width) { 1.1506 + asm volatile ( 1.1507 + "movdqa %0,%%xmm4 \n" 1.1508 + "movdqa %1,%%xmm3 \n" 1.1509 + "movdqa %2,%%xmm5 \n" 1.1510 + : 1.1511 + : "m"(kBGRAToU), // %0 1.1512 + "m"(kBGRAToV), // %1 1.1513 + "m"(kAddUV128) // %2 1.1514 + ); 1.1515 + asm volatile ( 1.1516 + "sub %1,%2 \n" 1.1517 + LABELALIGN 1.1518 + "1: \n" 1.1519 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1520 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1521 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1522 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1523 + BUNDLEALIGN 1.1524 + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 1.1525 + 
MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 1.1526 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 1.1527 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 1.1528 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1529 + "movdqa %%xmm0,%%xmm7 \n" 1.1530 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1531 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1532 + "pavgb %%xmm7,%%xmm0 \n" 1.1533 + "movdqa %%xmm2,%%xmm7 \n" 1.1534 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1535 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1536 + "pavgb %%xmm7,%%xmm2 \n" 1.1537 + "movdqa %%xmm0,%%xmm1 \n" 1.1538 + "movdqa %%xmm2,%%xmm6 \n" 1.1539 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1540 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1541 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1542 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1543 + "phaddw %%xmm2,%%xmm0 \n" 1.1544 + "phaddw %%xmm6,%%xmm1 \n" 1.1545 + "psraw $0x8,%%xmm0 \n" 1.1546 + "psraw $0x8,%%xmm1 \n" 1.1547 + "packsswb %%xmm1,%%xmm0 \n" 1.1548 + "paddb %%xmm5,%%xmm0 \n" 1.1549 + "sub $0x10,%3 \n" 1.1550 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1551 + BUNDLEALIGN 1.1552 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1553 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1554 + "jg 1b \n" 1.1555 + : "+r"(src_bgra0), // %0 1.1556 + "+r"(dst_u), // %1 1.1557 + "+r"(dst_v), // %2 1.1558 + "+rm"(width) // %3 1.1559 + : "r"((intptr_t)(src_stride_bgra)) // %4 1.1560 + : "memory", "cc" 1.1561 +#if defined(__native_client__) && defined(__x86_64__) 1.1562 + , "r14" 1.1563 +#endif 1.1564 +#if defined(__SSE2__) 1.1565 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1566 +#endif 1.1567 + ); 1.1568 +} 1.1569 + 1.1570 +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, 1.1571 + uint8* dst_u, uint8* dst_v, int width) { 1.1572 + asm volatile ( 1.1573 + "movdqa %0,%%xmm4 \n" 1.1574 + "movdqa %1,%%xmm3 \n" 1.1575 + "movdqa %2,%%xmm5 \n" 1.1576 + : 1.1577 + : "m"(kBGRAToU), // %0 1.1578 + "m"(kBGRAToV), // %1 1.1579 + "m"(kAddUV128) // %2 1.1580 + ); 1.1581 + asm volatile ( 
1.1582 + "sub %1,%2 \n" 1.1583 + LABELALIGN 1.1584 + "1: \n" 1.1585 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1586 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1587 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1588 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1589 + BUNDLEALIGN 1.1590 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1.1591 + "pavgb %%xmm7,%%xmm0 \n" 1.1592 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1.1593 + "pavgb %%xmm7,%%xmm1 \n" 1.1594 + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1.1595 + "pavgb %%xmm7,%%xmm2 \n" 1.1596 + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1.1597 + "pavgb %%xmm7,%%xmm6 \n" 1.1598 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1599 + "movdqa %%xmm0,%%xmm7 \n" 1.1600 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1601 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1602 + "pavgb %%xmm7,%%xmm0 \n" 1.1603 + "movdqa %%xmm2,%%xmm7 \n" 1.1604 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1605 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1606 + "pavgb %%xmm7,%%xmm2 \n" 1.1607 + "movdqa %%xmm0,%%xmm1 \n" 1.1608 + "movdqa %%xmm2,%%xmm6 \n" 1.1609 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1610 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1611 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1612 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1613 + "phaddw %%xmm2,%%xmm0 \n" 1.1614 + "phaddw %%xmm6,%%xmm1 \n" 1.1615 + "psraw $0x8,%%xmm0 \n" 1.1616 + "psraw $0x8,%%xmm1 \n" 1.1617 + "packsswb %%xmm1,%%xmm0 \n" 1.1618 + "paddb %%xmm5,%%xmm0 \n" 1.1619 + "sub $0x10,%3 \n" 1.1620 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1621 + BUNDLEALIGN 1.1622 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1623 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1624 + "jg 1b \n" 1.1625 + : "+r"(src_bgra0), // %0 1.1626 + "+r"(dst_u), // %1 1.1627 + "+r"(dst_v), // %2 1.1628 + "+rm"(width) // %3 1.1629 + : "r"((intptr_t)(src_stride_bgra)) // %4 1.1630 + : "memory", "cc" 1.1631 +#if defined(__native_client__) && defined(__x86_64__) 1.1632 + , "r14" 1.1633 +#endif 1.1634 +#if 
defined(__SSE2__) 1.1635 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1636 +#endif 1.1637 + ); 1.1638 +} 1.1639 + 1.1640 +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { 1.1641 + asm volatile ( 1.1642 + "movdqa %4,%%xmm5 \n" 1.1643 + "movdqa %3,%%xmm4 \n" 1.1644 + LABELALIGN 1.1645 + "1: \n" 1.1646 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1647 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1648 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1649 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.1650 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1651 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1652 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1653 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.1654 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1655 + "phaddw %%xmm1,%%xmm0 \n" 1.1656 + "phaddw %%xmm3,%%xmm2 \n" 1.1657 + "psrlw $0x7,%%xmm0 \n" 1.1658 + "psrlw $0x7,%%xmm2 \n" 1.1659 + "packuswb %%xmm2,%%xmm0 \n" 1.1660 + "paddb %%xmm5,%%xmm0 \n" 1.1661 + "sub $0x10,%2 \n" 1.1662 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.1663 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1664 + "jg 1b \n" 1.1665 + : "+r"(src_abgr), // %0 1.1666 + "+r"(dst_y), // %1 1.1667 + "+r"(pix) // %2 1.1668 + : "m"(kABGRToY), // %3 1.1669 + "m"(kAddY16) // %4 1.1670 + : "memory", "cc" 1.1671 +#if defined(__SSE2__) 1.1672 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.1673 +#endif 1.1674 + ); 1.1675 +} 1.1676 + 1.1677 +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { 1.1678 + asm volatile ( 1.1679 + "movdqa %4,%%xmm5 \n" 1.1680 + "movdqa %3,%%xmm4 \n" 1.1681 + LABELALIGN 1.1682 + "1: \n" 1.1683 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1684 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1685 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1686 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.1687 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1688 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1689 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1690 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.1691 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1692 + "phaddw %%xmm1,%%xmm0 \n" 1.1693 + 
"phaddw %%xmm3,%%xmm2 \n" 1.1694 + "psrlw $0x7,%%xmm0 \n" 1.1695 + "psrlw $0x7,%%xmm2 \n" 1.1696 + "packuswb %%xmm2,%%xmm0 \n" 1.1697 + "paddb %%xmm5,%%xmm0 \n" 1.1698 + "sub $0x10,%2 \n" 1.1699 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.1700 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1701 + "jg 1b \n" 1.1702 + : "+r"(src_abgr), // %0 1.1703 + "+r"(dst_y), // %1 1.1704 + "+r"(pix) // %2 1.1705 + : "m"(kABGRToY), // %3 1.1706 + "m"(kAddY16) // %4 1.1707 + : "memory", "cc" 1.1708 +#if defined(__SSE2__) 1.1709 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.1710 +#endif 1.1711 + ); 1.1712 +} 1.1713 + 1.1714 +void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { 1.1715 + asm volatile ( 1.1716 + "movdqa %4,%%xmm5 \n" 1.1717 + "movdqa %3,%%xmm4 \n" 1.1718 + LABELALIGN 1.1719 + "1: \n" 1.1720 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1721 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1722 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1723 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.1724 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1725 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1726 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1727 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.1728 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1729 + "phaddw %%xmm1,%%xmm0 \n" 1.1730 + "phaddw %%xmm3,%%xmm2 \n" 1.1731 + "psrlw $0x7,%%xmm0 \n" 1.1732 + "psrlw $0x7,%%xmm2 \n" 1.1733 + "packuswb %%xmm2,%%xmm0 \n" 1.1734 + "paddb %%xmm5,%%xmm0 \n" 1.1735 + "sub $0x10,%2 \n" 1.1736 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.1737 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1738 + "jg 1b \n" 1.1739 + : "+r"(src_rgba), // %0 1.1740 + "+r"(dst_y), // %1 1.1741 + "+r"(pix) // %2 1.1742 + : "m"(kRGBAToY), // %3 1.1743 + "m"(kAddY16) // %4 1.1744 + : "memory", "cc" 1.1745 +#if defined(__SSE2__) 1.1746 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.1747 +#endif 1.1748 + ); 1.1749 +} 1.1750 + 1.1751 +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { 1.1752 + asm volatile ( 1.1753 + "movdqa %4,%%xmm5 \n" 1.1754 + "movdqa 
%3,%%xmm4 \n" 1.1755 + LABELALIGN 1.1756 + "1: \n" 1.1757 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1758 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1759 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1760 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.1761 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1762 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.1763 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1764 + "pmaddubsw %%xmm4,%%xmm3 \n" 1.1765 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1766 + "phaddw %%xmm1,%%xmm0 \n" 1.1767 + "phaddw %%xmm3,%%xmm2 \n" 1.1768 + "psrlw $0x7,%%xmm0 \n" 1.1769 + "psrlw $0x7,%%xmm2 \n" 1.1770 + "packuswb %%xmm2,%%xmm0 \n" 1.1771 + "paddb %%xmm5,%%xmm0 \n" 1.1772 + "sub $0x10,%2 \n" 1.1773 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.1774 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.1775 + "jg 1b \n" 1.1776 + : "+r"(src_rgba), // %0 1.1777 + "+r"(dst_y), // %1 1.1778 + "+r"(pix) // %2 1.1779 + : "m"(kRGBAToY), // %3 1.1780 + "m"(kAddY16) // %4 1.1781 + : "memory", "cc" 1.1782 +#if defined(__SSE2__) 1.1783 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.1784 +#endif 1.1785 + ); 1.1786 +} 1.1787 + 1.1788 +void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, 1.1789 + uint8* dst_u, uint8* dst_v, int width) { 1.1790 + asm volatile ( 1.1791 + "movdqa %0,%%xmm4 \n" 1.1792 + "movdqa %1,%%xmm3 \n" 1.1793 + "movdqa %2,%%xmm5 \n" 1.1794 + : 1.1795 + : "m"(kABGRToU), // %0 1.1796 + "m"(kABGRToV), // %1 1.1797 + "m"(kAddUV128) // %2 1.1798 + ); 1.1799 + asm volatile ( 1.1800 + "sub %1,%2 \n" 1.1801 + LABELALIGN 1.1802 + "1: \n" 1.1803 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1804 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1805 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1806 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1807 + BUNDLEALIGN 1.1808 + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 1.1809 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 1.1810 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 1.1811 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // 
pavgb 0x30(%0,%4,1),%%xmm6 1.1812 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1813 + "movdqa %%xmm0,%%xmm7 \n" 1.1814 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1815 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1816 + "pavgb %%xmm7,%%xmm0 \n" 1.1817 + "movdqa %%xmm2,%%xmm7 \n" 1.1818 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1819 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1820 + "pavgb %%xmm7,%%xmm2 \n" 1.1821 + "movdqa %%xmm0,%%xmm1 \n" 1.1822 + "movdqa %%xmm2,%%xmm6 \n" 1.1823 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1824 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1825 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1826 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1827 + "phaddw %%xmm2,%%xmm0 \n" 1.1828 + "phaddw %%xmm6,%%xmm1 \n" 1.1829 + "psraw $0x8,%%xmm0 \n" 1.1830 + "psraw $0x8,%%xmm1 \n" 1.1831 + "packsswb %%xmm1,%%xmm0 \n" 1.1832 + "paddb %%xmm5,%%xmm0 \n" 1.1833 + "sub $0x10,%3 \n" 1.1834 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1835 + BUNDLEALIGN 1.1836 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1837 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1838 + "jg 1b \n" 1.1839 + : "+r"(src_abgr0), // %0 1.1840 + "+r"(dst_u), // %1 1.1841 + "+r"(dst_v), // %2 1.1842 + "+rm"(width) // %3 1.1843 + : "r"((intptr_t)(src_stride_abgr)) // %4 1.1844 + : "memory", "cc" 1.1845 +#if defined(__native_client__) && defined(__x86_64__) 1.1846 + , "r14" 1.1847 +#endif 1.1848 +#if defined(__SSE2__) 1.1849 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1850 +#endif 1.1851 + ); 1.1852 +} 1.1853 + 1.1854 +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, 1.1855 + uint8* dst_u, uint8* dst_v, int width) { 1.1856 + asm volatile ( 1.1857 + "movdqa %0,%%xmm4 \n" 1.1858 + "movdqa %1,%%xmm3 \n" 1.1859 + "movdqa %2,%%xmm5 \n" 1.1860 + : 1.1861 + : "m"(kABGRToU), // %0 1.1862 + "m"(kABGRToV), // %1 1.1863 + "m"(kAddUV128) // %2 1.1864 + ); 1.1865 + asm volatile ( 1.1866 + "sub %1,%2 \n" 1.1867 + LABELALIGN 1.1868 + "1: \n" 1.1869 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.1870 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1871 + "movdqu " 
MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1872 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1873 + BUNDLEALIGN 1.1874 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1.1875 + "pavgb %%xmm7,%%xmm0 \n" 1.1876 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1.1877 + "pavgb %%xmm7,%%xmm1 \n" 1.1878 + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1.1879 + "pavgb %%xmm7,%%xmm2 \n" 1.1880 + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1.1881 + "pavgb %%xmm7,%%xmm6 \n" 1.1882 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1883 + "movdqa %%xmm0,%%xmm7 \n" 1.1884 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1885 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1886 + "pavgb %%xmm7,%%xmm0 \n" 1.1887 + "movdqa %%xmm2,%%xmm7 \n" 1.1888 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1889 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1890 + "pavgb %%xmm7,%%xmm2 \n" 1.1891 + "movdqa %%xmm0,%%xmm1 \n" 1.1892 + "movdqa %%xmm2,%%xmm6 \n" 1.1893 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1894 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1895 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1896 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1897 + "phaddw %%xmm2,%%xmm0 \n" 1.1898 + "phaddw %%xmm6,%%xmm1 \n" 1.1899 + "psraw $0x8,%%xmm0 \n" 1.1900 + "psraw $0x8,%%xmm1 \n" 1.1901 + "packsswb %%xmm1,%%xmm0 \n" 1.1902 + "paddb %%xmm5,%%xmm0 \n" 1.1903 + "sub $0x10,%3 \n" 1.1904 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1905 + BUNDLEALIGN 1.1906 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1907 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1908 + "jg 1b \n" 1.1909 + : "+r"(src_abgr0), // %0 1.1910 + "+r"(dst_u), // %1 1.1911 + "+r"(dst_v), // %2 1.1912 + "+rm"(width) // %3 1.1913 + : "r"((intptr_t)(src_stride_abgr)) // %4 1.1914 + : "memory", "cc" 1.1915 +#if defined(__native_client__) && defined(__x86_64__) 1.1916 + , "r14" 1.1917 +#endif 1.1918 +#if defined(__SSE2__) 1.1919 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1920 +#endif 1.1921 + ); 1.1922 +} 1.1923 + 1.1924 +void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int 
src_stride_rgba, 1.1925 + uint8* dst_u, uint8* dst_v, int width) { 1.1926 + asm volatile ( 1.1927 + "movdqa %0,%%xmm4 \n" 1.1928 + "movdqa %1,%%xmm3 \n" 1.1929 + "movdqa %2,%%xmm5 \n" 1.1930 + : 1.1931 + : "m"(kRGBAToU), // %0 1.1932 + "m"(kRGBAToV), // %1 1.1933 + "m"(kAddUV128) // %2 1.1934 + ); 1.1935 + asm volatile ( 1.1936 + "sub %1,%2 \n" 1.1937 + LABELALIGN 1.1938 + "1: \n" 1.1939 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.1940 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.1941 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.1942 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.1943 + BUNDLEALIGN 1.1944 + MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 1.1945 + MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 1.1946 + MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 1.1947 + MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 1.1948 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.1949 + "movdqa %%xmm0,%%xmm7 \n" 1.1950 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.1951 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.1952 + "pavgb %%xmm7,%%xmm0 \n" 1.1953 + "movdqa %%xmm2,%%xmm7 \n" 1.1954 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.1955 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.1956 + "pavgb %%xmm7,%%xmm2 \n" 1.1957 + "movdqa %%xmm0,%%xmm1 \n" 1.1958 + "movdqa %%xmm2,%%xmm6 \n" 1.1959 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.1960 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.1961 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.1962 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.1963 + "phaddw %%xmm2,%%xmm0 \n" 1.1964 + "phaddw %%xmm6,%%xmm1 \n" 1.1965 + "psraw $0x8,%%xmm0 \n" 1.1966 + "psraw $0x8,%%xmm1 \n" 1.1967 + "packsswb %%xmm1,%%xmm0 \n" 1.1968 + "paddb %%xmm5,%%xmm0 \n" 1.1969 + "sub $0x10,%3 \n" 1.1970 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.1971 + BUNDLEALIGN 1.1972 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.1973 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.1974 + "jg 1b \n" 1.1975 + : "+r"(src_rgba0), // %0 1.1976 + "+r"(dst_u), // %1 1.1977 + "+r"(dst_v), // %2 1.1978 + "+rm"(width) // %3 1.1979 + 
: "r"((intptr_t)(src_stride_rgba)) 1.1980 + : "memory", "cc" 1.1981 +#if defined(__native_client__) && defined(__x86_64__) 1.1982 + , "r14" 1.1983 +#endif 1.1984 +#if defined(__SSE2__) 1.1985 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.1986 +#endif 1.1987 + ); 1.1988 +} 1.1989 + 1.1990 +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, 1.1991 + uint8* dst_u, uint8* dst_v, int width) { 1.1992 + asm volatile ( 1.1993 + "movdqa %0,%%xmm4 \n" 1.1994 + "movdqa %1,%%xmm3 \n" 1.1995 + "movdqa %2,%%xmm5 \n" 1.1996 + : 1.1997 + : "m"(kRGBAToU), // %0 1.1998 + "m"(kRGBAToV), // %1 1.1999 + "m"(kAddUV128) // %2 1.2000 + ); 1.2001 + asm volatile ( 1.2002 + "sub %1,%2 \n" 1.2003 + LABELALIGN 1.2004 + "1: \n" 1.2005 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.2006 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.2007 + "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.2008 + "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" 1.2009 + BUNDLEALIGN 1.2010 + MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 1.2011 + "pavgb %%xmm7,%%xmm0 \n" 1.2012 + MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 1.2013 + "pavgb %%xmm7,%%xmm1 \n" 1.2014 + MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 1.2015 + "pavgb %%xmm7,%%xmm2 \n" 1.2016 + MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 1.2017 + "pavgb %%xmm7,%%xmm6 \n" 1.2018 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.2019 + "movdqa %%xmm0,%%xmm7 \n" 1.2020 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.2021 + "shufps $0xdd,%%xmm1,%%xmm7 \n" 1.2022 + "pavgb %%xmm7,%%xmm0 \n" 1.2023 + "movdqa %%xmm2,%%xmm7 \n" 1.2024 + "shufps $0x88,%%xmm6,%%xmm2 \n" 1.2025 + "shufps $0xdd,%%xmm6,%%xmm7 \n" 1.2026 + "pavgb %%xmm7,%%xmm2 \n" 1.2027 + "movdqa %%xmm0,%%xmm1 \n" 1.2028 + "movdqa %%xmm2,%%xmm6 \n" 1.2029 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.2030 + "pmaddubsw %%xmm4,%%xmm2 \n" 1.2031 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.2032 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.2033 + "phaddw %%xmm2,%%xmm0 \n" 1.2034 + "phaddw 
%%xmm6,%%xmm1 \n" 1.2035 + "psraw $0x8,%%xmm0 \n" 1.2036 + "psraw $0x8,%%xmm1 \n" 1.2037 + "packsswb %%xmm1,%%xmm0 \n" 1.2038 + "paddb %%xmm5,%%xmm0 \n" 1.2039 + "sub $0x10,%3 \n" 1.2040 + "movlps %%xmm0," MEMACCESS(1) " \n" 1.2041 + BUNDLEALIGN 1.2042 + MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) 1.2043 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.2044 + "jg 1b \n" 1.2045 + : "+r"(src_rgba0), // %0 1.2046 + "+r"(dst_u), // %1 1.2047 + "+r"(dst_v), // %2 1.2048 + "+rm"(width) // %3 1.2049 + : "r"((intptr_t)(src_stride_rgba)) // %4 1.2050 + : "memory", "cc" 1.2051 +#if defined(__native_client__) && defined(__x86_64__) 1.2052 + , "r14" 1.2053 +#endif 1.2054 +#if defined(__SSE2__) 1.2055 + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" 1.2056 +#endif 1.2057 + ); 1.2058 +} 1.2059 +#endif // HAS_ARGBTOUVROW_SSSE3 1.2060 + 1.2061 +#ifdef HAS_I422TOARGBROW_SSSE3 1.2062 +#define UB 127 /* min(63,(int8)(2.018 * 64)) */ 1.2063 +#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ 1.2064 +#define UR 0 1.2065 + 1.2066 +#define VB 0 1.2067 +#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ 1.2068 +#define VR 102 /* (int8)(1.596 * 64 + 0.5) */ 1.2069 + 1.2070 +// Bias 1.2071 +#define BB UB * 128 + VB * 128 1.2072 +#define BG UG * 128 + VG * 128 1.2073 +#define BR UR * 128 + VR * 128 1.2074 + 1.2075 +#define YG 74 /* (int8)(1.164 * 64 + 0.5) */ 1.2076 + 1.2077 +struct { 1.2078 + vec8 kUVToB; // 0 1.2079 + vec8 kUVToG; // 16 1.2080 + vec8 kUVToR; // 32 1.2081 + vec16 kUVBiasB; // 48 1.2082 + vec16 kUVBiasG; // 64 1.2083 + vec16 kUVBiasR; // 80 1.2084 + vec16 kYSub16; // 96 1.2085 + vec16 kYToRgb; // 112 1.2086 + vec8 kVUToB; // 128 1.2087 + vec8 kVUToG; // 144 1.2088 + vec8 kVUToR; // 160 1.2089 +} static SIMD_ALIGNED(kYuvConstants) = { 1.2090 + { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, 1.2091 + { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, 1.2092 + { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, 1.2093 + { BB, BB, BB, BB, 
BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },  // kUVBiasG (offset 64)
  { BR, BR, BR, BR, BR, BR, BR, BR },  // kUVBiasR (offset 80)
  { 16, 16, 16, 16, 16, 16, 16, 16 },  // kYSub16 (offset 96): Y bias
  { YG, YG, YG, YG, YG, YG, YG, YG },  // kYToRgb (offset 112): Y gain
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },  // kVUToB
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },  // kVUToG
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }   // kVUToR
};

// Read 8 UV from 444 (one full U and V sample per pixel, no upsampling).
// NOTE(review): the original comment said "411", but this macro loads 8
// whole U bytes with movq and advances u_buf by 8 — that is 4:4:4 sampling;
// the actual 4:1:1 reader is READYUV411 below.
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    BUNDLEALIGN \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n"

// Read 4 UV from 422, upsample to 8 UV (each chroma pair duplicated 2x).
#define READYUV422 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    BUNDLEALIGN \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Read 2 UV from 411, upsample to 8 UV (each chroma pair duplicated 4x).
#define READYUV411 \
    "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
    BUNDLEALIGN \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
    "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
    "punpcklbw %%xmm1,%%xmm0 \n" \
    "punpcklwd %%xmm0,%%xmm0 \n" \
    "punpckldq %%xmm0,%%xmm0 \n"

// Read 4 UV from NV12 (UV already interleaved in memory), upsample to 8 UV.
#define READNV12 \
    "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
    "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
    "punpcklwd %%xmm0,%%xmm0 \n"

// Convert 8 pixels: 8 UV and 8 Y.
// Expects 8 interleaved U,V byte pairs in xmm0 and zero in xmm4; leaves
// packed B in xmm0, G in xmm1, R in xmm2.
#define YUVTORGB \
    "movdqa %%xmm0,%%xmm1 \n" \
    "movdqa %%xmm0,%%xmm2 \n" \
    "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
    "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
1.2143 + "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ 1.2144 + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ 1.2145 + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ 1.2146 + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ 1.2147 + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ 1.2148 + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ 1.2149 + "punpcklbw %%xmm4,%%xmm3 \n" \ 1.2150 + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ 1.2151 + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ 1.2152 + "paddsw %%xmm3,%%xmm0 \n" \ 1.2153 + "paddsw %%xmm3,%%xmm1 \n" \ 1.2154 + "paddsw %%xmm3,%%xmm2 \n" \ 1.2155 + "psraw $0x6,%%xmm0 \n" \ 1.2156 + "psraw $0x6,%%xmm1 \n" \ 1.2157 + "psraw $0x6,%%xmm2 \n" \ 1.2158 + "packuswb %%xmm0,%%xmm0 \n" \ 1.2159 + "packuswb %%xmm1,%%xmm1 \n" \ 1.2160 + "packuswb %%xmm2,%%xmm2 \n" 1.2161 + 1.2162 +// Convert 8 pixels: 8 VU and 8 Y 1.2163 +#define YVUTORGB \ 1.2164 + "movdqa %%xmm0,%%xmm1 \n" \ 1.2165 + "movdqa %%xmm0,%%xmm2 \n" \ 1.2166 + "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ 1.2167 + "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ 1.2168 + "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ 1.2169 + "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ 1.2170 + "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ 1.2171 + "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ 1.2172 + "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ 1.2173 + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ 1.2174 + "punpcklbw %%xmm4,%%xmm3 \n" \ 1.2175 + "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ 1.2176 + "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ 1.2177 + "paddsw %%xmm3,%%xmm0 \n" \ 1.2178 + "paddsw %%xmm3,%%xmm1 \n" \ 1.2179 + "paddsw %%xmm3,%%xmm2 \n" \ 1.2180 + "psraw $0x6,%%xmm0 \n" \ 1.2181 + "psraw $0x6,%%xmm1 \n" \ 1.2182 + "psraw $0x6,%%xmm2 \n" \ 1.2183 + "packuswb %%xmm0,%%xmm0 \n" \ 1.2184 + "packuswb %%xmm1,%%xmm1 \n" \ 
1.2185 + "packuswb %%xmm2,%%xmm2 \n" 1.2186 + 1.2187 +void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, 1.2188 + const uint8* u_buf, 1.2189 + const uint8* v_buf, 1.2190 + uint8* dst_argb, 1.2191 + int width) { 1.2192 + asm volatile ( 1.2193 + "sub %[u_buf],%[v_buf] \n" 1.2194 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2195 + "pxor %%xmm4,%%xmm4 \n" 1.2196 + LABELALIGN 1.2197 + "1: \n" 1.2198 + READYUV444 1.2199 + YUVTORGB 1.2200 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2201 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2202 + "movdqa %%xmm0,%%xmm1 \n" 1.2203 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2204 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2205 + "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" 1.2206 + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" 1.2207 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2208 + "sub $0x8,%[width] \n" 1.2209 + "jg 1b \n" 1.2210 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2211 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2212 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2213 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2214 + [width]"+rm"(width) // %[width] 1.2215 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2216 + : "memory", "cc" 1.2217 +#if defined(__native_client__) && defined(__x86_64__) 1.2218 + , "r14" 1.2219 +#endif 1.2220 +#if defined(__SSE2__) 1.2221 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2222 +#endif 1.2223 + ); 1.2224 +} 1.2225 + 1.2226 +void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, 1.2227 + const uint8* u_buf, 1.2228 + const uint8* v_buf, 1.2229 + uint8* dst_rgb24, 1.2230 + int width) { 1.2231 +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. 
1.2232 +#if defined(__i386__) 1.2233 + asm volatile ( 1.2234 + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" 1.2235 + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" 1.2236 + :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), 1.2237 + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)); 1.2238 +#endif 1.2239 + 1.2240 + asm volatile ( 1.2241 +#if !defined(__i386__) 1.2242 + "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" 1.2243 + "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" 1.2244 +#endif 1.2245 + "sub %[u_buf],%[v_buf] \n" 1.2246 + "pxor %%xmm4,%%xmm4 \n" 1.2247 + LABELALIGN 1.2248 + "1: \n" 1.2249 + READYUV422 1.2250 + YUVTORGB 1.2251 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2252 + "punpcklbw %%xmm2,%%xmm2 \n" 1.2253 + "movdqa %%xmm0,%%xmm1 \n" 1.2254 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2255 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2256 + "pshufb %%xmm5,%%xmm0 \n" 1.2257 + "pshufb %%xmm6,%%xmm1 \n" 1.2258 + "palignr $0xc,%%xmm0,%%xmm1 \n" 1.2259 + "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" 1.2260 + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" 1.2261 + "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" 1.2262 + "sub $0x8,%[width] \n" 1.2263 + "jg 1b \n" 1.2264 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2265 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2266 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2267 + [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] 1.2268 + [width]"+rm"(width) // %[width] 1.2269 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) 1.2270 +#if !defined(__i386__) 1.2271 + , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), 1.2272 + [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) 1.2273 +#endif 1.2274 + : "memory", "cc" 1.2275 +#if defined(__native_client__) && defined(__x86_64__) 1.2276 + , "r14" 1.2277 +#endif 1.2278 +#if defined(__SSE2__) 1.2279 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.2280 +#endif 1.2281 + ); 1.2282 +} 1.2283 + 1.2284 +void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, 1.2285 + const uint8* u_buf, 1.2286 + const uint8* 
v_buf, 1.2287 + uint8* dst_raw, 1.2288 + int width) { 1.2289 +// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs. 1.2290 +#if defined(__i386__) 1.2291 + asm volatile ( 1.2292 + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" 1.2293 + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" 1.2294 + :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), 1.2295 + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)); 1.2296 +#endif 1.2297 + 1.2298 + asm volatile ( 1.2299 +#if !defined(__i386__) 1.2300 + "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" 1.2301 + "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" 1.2302 +#endif 1.2303 + "sub %[u_buf],%[v_buf] \n" 1.2304 + "pxor %%xmm4,%%xmm4 \n" 1.2305 + LABELALIGN 1.2306 + "1: \n" 1.2307 + READYUV422 1.2308 + YUVTORGB 1.2309 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2310 + "punpcklbw %%xmm2,%%xmm2 \n" 1.2311 + "movdqa %%xmm0,%%xmm1 \n" 1.2312 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2313 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2314 + "pshufb %%xmm5,%%xmm0 \n" 1.2315 + "pshufb %%xmm6,%%xmm1 \n" 1.2316 + "palignr $0xc,%%xmm0,%%xmm1 \n" 1.2317 + "movq %%xmm0," MEMACCESS([dst_raw]) " \n" 1.2318 + "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" 1.2319 + "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" 1.2320 + "sub $0x8,%[width] \n" 1.2321 + "jg 1b \n" 1.2322 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2323 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2324 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2325 + [dst_raw]"+r"(dst_raw), // %[dst_raw] 1.2326 + [width]"+rm"(width) // %[width] 1.2327 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) 1.2328 +#if !defined(__i386__) 1.2329 + , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), 1.2330 + [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) 1.2331 +#endif 1.2332 + : "memory", "cc" 1.2333 +#if defined(__native_client__) && defined(__x86_64__) 1.2334 + , "r14" 1.2335 +#endif 1.2336 +#if defined(__SSE2__) 1.2337 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.2338 +#endif 1.2339 + ); 1.2340 +} 1.2341 + 1.2342 +void OMITFP 
I422ToARGBRow_SSSE3(const uint8* y_buf,
                    const uint8* u_buf,
                    const uint8* v_buf,
                    uint8* dst_argb,
                    int width) {
  // Convert 8 I422 (4:2:2) YUV pixels to 8 ARGB pixels per loop iteration.
  // Reads 8 Y and 4 U / 4 V (upsampled to 8 by READYUV422); writes 32
  // bytes of ARGB with movdqa, so dst_argb must be 16-byte aligned.
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"  // v_buf becomes an offset from u_buf
    "pcmpeqb %%xmm5,%%xmm5 \n"  // all-ones: the 0xff alpha bytes
    "pxor %%xmm4,%%xmm4 \n"  // zero: widens Y bytes inside YUVTORGB
    LABELALIGN
  "1: \n"
    READYUV422
    YUVTORGB
    // Interleave B,G,R and alpha into two 16-byte ARGB stores.
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// Convert 8 I411 (4:1:1) YUV pixels to 8 ARGB pixels per loop iteration.
// READYUV411 reads only 2 U / 2 V and replicates each chroma sample 4x.
// Aligned (movdqa) stores: dst_argb must be 16-byte aligned.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "sub %[u_buf],%[v_buf] \n"  // v_buf becomes an offset from u_buf
    "pcmpeqb %%xmm5,%%xmm5 \n"  // all-ones: the 0xff alpha bytes
    "pxor %%xmm4,%%xmm4 \n"  // zero: widens Y bytes inside YUVTORGB
    LABELALIGN
  "1: \n"
    READYUV411
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [u_buf]"+r"(u_buf), // %[u_buf]
    [v_buf]"+r"(v_buf), // %[v_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

// Convert 8 NV12 pixels (single interleaved UV plane) to 8 ARGB per
// iteration.  READNV12 loads the interleaved chroma directly, so no
// u_buf/v_buf offset trick (and no extra index register) is needed.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"  // all-ones: the 0xff alpha bytes
    "pxor %%xmm4,%%xmm4 \n"  // zero: widens Y bytes inside YUVTORGB
    LABELALIGN
  "1: \n"
    READNV12
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm0 \n"
    "punpcklbw %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm2,%%xmm0 \n"
    "punpckhwd %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub $0x8,%[width] \n"
    "jg 1b \n"
  : [y_buf]"+r"(y_buf), // %[y_buf]
    [uv_buf]"+r"(uv_buf), // %[uv_buf]
    [dst_argb]"+r"(dst_argb), // %[dst_argb]
    [width]"+rm"(width) // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
1.2448 +#if defined(__SSE2__) 1.2449 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2450 +#endif 1.2451 + ); 1.2452 +} 1.2453 + 1.2454 +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, 1.2455 + const uint8* uv_buf, 1.2456 + uint8* dst_argb, 1.2457 + int width) { 1.2458 + asm volatile ( 1.2459 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2460 + "pxor %%xmm4,%%xmm4 \n" 1.2461 + LABELALIGN 1.2462 + "1: \n" 1.2463 + READNV12 1.2464 + YVUTORGB 1.2465 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2466 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2467 + "movdqa %%xmm0,%%xmm1 \n" 1.2468 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2469 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2470 + "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" 1.2471 + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 1.2472 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2473 + "sub $0x8,%[width] \n" 1.2474 + "jg 1b \n" 1.2475 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2476 + [uv_buf]"+r"(uv_buf), // %[uv_buf] 1.2477 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2478 + [width]"+rm"(width) // %[width] 1.2479 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2480 + : "memory", "cc" 1.2481 + // Does not use r14. 
1.2482 +#if defined(__SSE2__) 1.2483 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2484 +#endif 1.2485 + ); 1.2486 +} 1.2487 + 1.2488 +void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1.2489 + const uint8* u_buf, 1.2490 + const uint8* v_buf, 1.2491 + uint8* dst_argb, 1.2492 + int width) { 1.2493 + asm volatile ( 1.2494 + "sub %[u_buf],%[v_buf] \n" 1.2495 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2496 + "pxor %%xmm4,%%xmm4 \n" 1.2497 + LABELALIGN 1.2498 + "1: \n" 1.2499 + READYUV444 1.2500 + YUVTORGB 1.2501 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2502 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2503 + "movdqa %%xmm0,%%xmm1 \n" 1.2504 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2505 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2506 + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 1.2507 + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 1.2508 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2509 + "sub $0x8,%[width] \n" 1.2510 + "jg 1b \n" 1.2511 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2512 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2513 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2514 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2515 + [width]"+rm"(width) // %[width] 1.2516 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2517 + : "memory", "cc" 1.2518 +#if defined(__native_client__) && defined(__x86_64__) 1.2519 + , "r14" 1.2520 +#endif 1.2521 +#if defined(__SSE2__) 1.2522 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2523 +#endif 1.2524 + ); 1.2525 +} 1.2526 + 1.2527 +void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1.2528 + const uint8* u_buf, 1.2529 + const uint8* v_buf, 1.2530 + uint8* dst_argb, 1.2531 + int width) { 1.2532 + asm volatile ( 1.2533 + "sub %[u_buf],%[v_buf] \n" 1.2534 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2535 + "pxor %%xmm4,%%xmm4 \n" 1.2536 + LABELALIGN 1.2537 + "1: \n" 1.2538 + READYUV422 1.2539 + YUVTORGB 1.2540 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2541 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2542 + "movdqa %%xmm0,%%xmm1 \n" 1.2543 + "punpcklwd %%xmm2,%%xmm0 \n" 
1.2544 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2545 + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 1.2546 + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 1.2547 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2548 + "sub $0x8,%[width] \n" 1.2549 + "jg 1b \n" 1.2550 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2551 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2552 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2553 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2554 + [width]"+rm"(width) // %[width] 1.2555 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2556 + : "memory", "cc" 1.2557 +#if defined(__native_client__) && defined(__x86_64__) 1.2558 + , "r14" 1.2559 +#endif 1.2560 +#if defined(__SSE2__) 1.2561 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2562 +#endif 1.2563 + ); 1.2564 +} 1.2565 + 1.2566 +void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1.2567 + const uint8* u_buf, 1.2568 + const uint8* v_buf, 1.2569 + uint8* dst_argb, 1.2570 + int width) { 1.2571 + asm volatile ( 1.2572 + "sub %[u_buf],%[v_buf] \n" 1.2573 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2574 + "pxor %%xmm4,%%xmm4 \n" 1.2575 + LABELALIGN 1.2576 + "1: \n" 1.2577 + READYUV411 1.2578 + YUVTORGB 1.2579 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2580 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2581 + "movdqa %%xmm0,%%xmm1 \n" 1.2582 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2583 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2584 + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 1.2585 + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 1.2586 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2587 + "sub $0x8,%[width] \n" 1.2588 + "jg 1b \n" 1.2589 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2590 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2591 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2592 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2593 + [width]"+rm"(width) // %[width] 1.2594 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2595 + : "memory", "cc" 1.2596 +#if defined(__native_client__) && defined(__x86_64__) 1.2597 + , "r14" 1.2598 
+#endif 1.2599 +#if defined(__SSE2__) 1.2600 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2601 +#endif 1.2602 + ); 1.2603 +} 1.2604 + 1.2605 +void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1.2606 + const uint8* uv_buf, 1.2607 + uint8* dst_argb, 1.2608 + int width) { 1.2609 + asm volatile ( 1.2610 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2611 + "pxor %%xmm4,%%xmm4 \n" 1.2612 + LABELALIGN 1.2613 + "1: \n" 1.2614 + READNV12 1.2615 + YUVTORGB 1.2616 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2617 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2618 + "movdqa %%xmm0,%%xmm1 \n" 1.2619 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2620 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2621 + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 1.2622 + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 1.2623 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2624 + "sub $0x8,%[width] \n" 1.2625 + "jg 1b \n" 1.2626 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2627 + [uv_buf]"+r"(uv_buf), // %[uv_buf] 1.2628 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2629 + [width]"+rm"(width) // %[width] 1.2630 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2631 + : "memory", "cc" 1.2632 + // Does not use r14. 
1.2633 +#if defined(__SSE2__) 1.2634 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2635 +#endif 1.2636 + ); 1.2637 +} 1.2638 + 1.2639 +void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 1.2640 + const uint8* uv_buf, 1.2641 + uint8* dst_argb, 1.2642 + int width) { 1.2643 + asm volatile ( 1.2644 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2645 + "pxor %%xmm4,%%xmm4 \n" 1.2646 + LABELALIGN 1.2647 + "1: \n" 1.2648 + READNV12 1.2649 + YVUTORGB 1.2650 + "punpcklbw %%xmm1,%%xmm0 \n" 1.2651 + "punpcklbw %%xmm5,%%xmm2 \n" 1.2652 + "movdqa %%xmm0,%%xmm1 \n" 1.2653 + "punpcklwd %%xmm2,%%xmm0 \n" 1.2654 + "punpckhwd %%xmm2,%%xmm1 \n" 1.2655 + "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" 1.2656 + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" 1.2657 + "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" 1.2658 + "sub $0x8,%[width] \n" 1.2659 + "jg 1b \n" 1.2660 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2661 + [uv_buf]"+r"(uv_buf), // %[uv_buf] 1.2662 + [dst_argb]"+r"(dst_argb), // %[dst_argb] 1.2663 + [width]"+rm"(width) // %[width] 1.2664 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2665 + : "memory", "cc" 1.2666 + // Does not use r14. 
1.2667 +#if defined(__SSE2__) 1.2668 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2669 +#endif 1.2670 + ); 1.2671 +} 1.2672 + 1.2673 +void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, 1.2674 + const uint8* u_buf, 1.2675 + const uint8* v_buf, 1.2676 + uint8* dst_bgra, 1.2677 + int width) { 1.2678 + asm volatile ( 1.2679 + "sub %[u_buf],%[v_buf] \n" 1.2680 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2681 + "pxor %%xmm4,%%xmm4 \n" 1.2682 + LABELALIGN 1.2683 + "1: \n" 1.2684 + READYUV422 1.2685 + YUVTORGB 1.2686 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2687 + "punpcklbw %%xmm0,%%xmm1 \n" 1.2688 + "punpcklbw %%xmm2,%%xmm5 \n" 1.2689 + "movdqa %%xmm5,%%xmm0 \n" 1.2690 + "punpcklwd %%xmm1,%%xmm5 \n" 1.2691 + "punpckhwd %%xmm1,%%xmm0 \n" 1.2692 + "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" 1.2693 + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" 1.2694 + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" 1.2695 + "sub $0x8,%[width] \n" 1.2696 + "jg 1b \n" 1.2697 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2698 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2699 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2700 + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] 1.2701 + [width]"+rm"(width) // %[width] 1.2702 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2703 + : "memory", "cc" 1.2704 +#if defined(__native_client__) && defined(__x86_64__) 1.2705 + , "r14" 1.2706 +#endif 1.2707 +#if defined(__SSE2__) 1.2708 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2709 +#endif 1.2710 + ); 1.2711 +} 1.2712 + 1.2713 +void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, 1.2714 + const uint8* u_buf, 1.2715 + const uint8* v_buf, 1.2716 + uint8* dst_abgr, 1.2717 + int width) { 1.2718 + asm volatile ( 1.2719 + "sub %[u_buf],%[v_buf] \n" 1.2720 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2721 + "pxor %%xmm4,%%xmm4 \n" 1.2722 + LABELALIGN 1.2723 + "1: \n" 1.2724 + READYUV422 1.2725 + YUVTORGB 1.2726 + "punpcklbw %%xmm1,%%xmm2 \n" 1.2727 + "punpcklbw %%xmm5,%%xmm0 \n" 1.2728 + "movdqa %%xmm2,%%xmm1 \n" 1.2729 + "punpcklwd 
%%xmm0,%%xmm2 \n" 1.2730 + "punpckhwd %%xmm0,%%xmm1 \n" 1.2731 + "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" 1.2732 + "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" 1.2733 + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" 1.2734 + "sub $0x8,%[width] \n" 1.2735 + "jg 1b \n" 1.2736 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2737 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2738 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2739 + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] 1.2740 + [width]"+rm"(width) // %[width] 1.2741 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2742 + : "memory", "cc" 1.2743 +#if defined(__native_client__) && defined(__x86_64__) 1.2744 + , "r14" 1.2745 +#endif 1.2746 +#if defined(__SSE2__) 1.2747 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2748 +#endif 1.2749 + ); 1.2750 +} 1.2751 + 1.2752 +void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, 1.2753 + const uint8* u_buf, 1.2754 + const uint8* v_buf, 1.2755 + uint8* dst_rgba, 1.2756 + int width) { 1.2757 + asm volatile ( 1.2758 + "sub %[u_buf],%[v_buf] \n" 1.2759 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2760 + "pxor %%xmm4,%%xmm4 \n" 1.2761 + LABELALIGN 1.2762 + "1: \n" 1.2763 + READYUV422 1.2764 + YUVTORGB 1.2765 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2766 + "punpcklbw %%xmm2,%%xmm1 \n" 1.2767 + "punpcklbw %%xmm0,%%xmm5 \n" 1.2768 + "movdqa %%xmm5,%%xmm0 \n" 1.2769 + "punpcklwd %%xmm1,%%xmm5 \n" 1.2770 + "punpckhwd %%xmm1,%%xmm0 \n" 1.2771 + "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" 1.2772 + "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" 1.2773 + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" 1.2774 + "sub $0x8,%[width] \n" 1.2775 + "jg 1b \n" 1.2776 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2777 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2778 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2779 + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] 1.2780 + [width]"+rm"(width) // %[width] 1.2781 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2782 + : "memory", "cc" 1.2783 +#if defined(__native_client__) && 
defined(__x86_64__) 1.2784 + , "r14" 1.2785 +#endif 1.2786 +#if defined(__SSE2__) 1.2787 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2788 +#endif 1.2789 + ); 1.2790 +} 1.2791 + 1.2792 +void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, 1.2793 + const uint8* u_buf, 1.2794 + const uint8* v_buf, 1.2795 + uint8* dst_bgra, 1.2796 + int width) { 1.2797 + asm volatile ( 1.2798 + "sub %[u_buf],%[v_buf] \n" 1.2799 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2800 + "pxor %%xmm4,%%xmm4 \n" 1.2801 + LABELALIGN 1.2802 + "1: \n" 1.2803 + READYUV422 1.2804 + YUVTORGB 1.2805 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2806 + "punpcklbw %%xmm0,%%xmm1 \n" 1.2807 + "punpcklbw %%xmm2,%%xmm5 \n" 1.2808 + "movdqa %%xmm5,%%xmm0 \n" 1.2809 + "punpcklwd %%xmm1,%%xmm5 \n" 1.2810 + "punpckhwd %%xmm1,%%xmm0 \n" 1.2811 + "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" 1.2812 + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" 1.2813 + "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" 1.2814 + "sub $0x8,%[width] \n" 1.2815 + "jg 1b \n" 1.2816 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2817 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2818 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2819 + [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] 1.2820 + [width]"+rm"(width) // %[width] 1.2821 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2822 + : "memory", "cc" 1.2823 +#if defined(__native_client__) && defined(__x86_64__) 1.2824 + , "r14" 1.2825 +#endif 1.2826 +#if defined(__SSE2__) 1.2827 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2828 +#endif 1.2829 + ); 1.2830 +} 1.2831 + 1.2832 +void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, 1.2833 + const uint8* u_buf, 1.2834 + const uint8* v_buf, 1.2835 + uint8* dst_abgr, 1.2836 + int width) { 1.2837 + asm volatile ( 1.2838 + "sub %[u_buf],%[v_buf] \n" 1.2839 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2840 + "pxor %%xmm4,%%xmm4 \n" 1.2841 + LABELALIGN 1.2842 + "1: \n" 1.2843 + READYUV422 1.2844 + YUVTORGB 1.2845 + "punpcklbw %%xmm1,%%xmm2 \n" 1.2846 + "punpcklbw 
%%xmm5,%%xmm0 \n" 1.2847 + "movdqa %%xmm2,%%xmm1 \n" 1.2848 + "punpcklwd %%xmm0,%%xmm2 \n" 1.2849 + "punpckhwd %%xmm0,%%xmm1 \n" 1.2850 + "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" 1.2851 + "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" 1.2852 + "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" 1.2853 + "sub $0x8,%[width] \n" 1.2854 + "jg 1b \n" 1.2855 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2856 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2857 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2858 + [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] 1.2859 + [width]"+rm"(width) // %[width] 1.2860 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] 1.2861 + : "memory", "cc" 1.2862 +#if defined(__native_client__) && defined(__x86_64__) 1.2863 + , "r14" 1.2864 +#endif 1.2865 +#if defined(__SSE2__) 1.2866 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2867 +#endif 1.2868 + ); 1.2869 +} 1.2870 + 1.2871 +void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, 1.2872 + const uint8* u_buf, 1.2873 + const uint8* v_buf, 1.2874 + uint8* dst_rgba, 1.2875 + int width) { 1.2876 + asm volatile ( 1.2877 + "sub %[u_buf],%[v_buf] \n" 1.2878 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2879 + "pxor %%xmm4,%%xmm4 \n" 1.2880 + LABELALIGN 1.2881 + "1: \n" 1.2882 + READYUV422 1.2883 + YUVTORGB 1.2884 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.2885 + "punpcklbw %%xmm2,%%xmm1 \n" 1.2886 + "punpcklbw %%xmm0,%%xmm5 \n" 1.2887 + "movdqa %%xmm5,%%xmm0 \n" 1.2888 + "punpcklwd %%xmm1,%%xmm5 \n" 1.2889 + "punpckhwd %%xmm1,%%xmm0 \n" 1.2890 + "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" 1.2891 + "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" 1.2892 + "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" 1.2893 + "sub $0x8,%[width] \n" 1.2894 + "jg 1b \n" 1.2895 + : [y_buf]"+r"(y_buf), // %[y_buf] 1.2896 + [u_buf]"+r"(u_buf), // %[u_buf] 1.2897 + [v_buf]"+r"(v_buf), // %[v_buf] 1.2898 + [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] 1.2899 + [width]"+rm"(width) // %[width] 1.2900 + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // 
%[kYuvConstants] 1.2901 + : "memory", "cc" 1.2902 +#if defined(__native_client__) && defined(__x86_64__) 1.2903 + , "r14" 1.2904 +#endif 1.2905 +#if defined(__SSE2__) 1.2906 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.2907 +#endif 1.2908 + ); 1.2909 +} 1.2910 + 1.2911 +#endif // HAS_I422TOARGBROW_SSSE3 1.2912 + 1.2913 +#ifdef HAS_YTOARGBROW_SSE2 1.2914 +void YToARGBRow_SSE2(const uint8* y_buf, 1.2915 + uint8* dst_argb, 1.2916 + int width) { 1.2917 + asm volatile ( 1.2918 + "pxor %%xmm5,%%xmm5 \n" 1.2919 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.2920 + "pslld $0x18,%%xmm4 \n" 1.2921 + "mov $0x00100010,%%eax \n" 1.2922 + "movd %%eax,%%xmm3 \n" 1.2923 + "pshufd $0x0,%%xmm3,%%xmm3 \n" 1.2924 + "mov $0x004a004a,%%eax \n" 1.2925 + "movd %%eax,%%xmm2 \n" 1.2926 + "pshufd $0x0,%%xmm2,%%xmm2 \n" 1.2927 + LABELALIGN 1.2928 + "1: \n" 1.2929 + // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 1.2930 + "movq " MEMACCESS(0) ",%%xmm0 \n" 1.2931 + "lea " MEMLEA(0x8,0) ",%0 \n" 1.2932 + "punpcklbw %%xmm5,%%xmm0 \n" 1.2933 + "psubusw %%xmm3,%%xmm0 \n" 1.2934 + "pmullw %%xmm2,%%xmm0 \n" 1.2935 + "psrlw $6, %%xmm0 \n" 1.2936 + "packuswb %%xmm0,%%xmm0 \n" 1.2937 + 1.2938 + // Step 2: Weave into ARGB 1.2939 + "punpcklbw %%xmm0,%%xmm0 \n" 1.2940 + "movdqa %%xmm0,%%xmm1 \n" 1.2941 + "punpcklwd %%xmm0,%%xmm0 \n" 1.2942 + "punpckhwd %%xmm1,%%xmm1 \n" 1.2943 + "por %%xmm4,%%xmm0 \n" 1.2944 + "por %%xmm4,%%xmm1 \n" 1.2945 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.2946 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.2947 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.2948 + 1.2949 + "sub $0x8,%2 \n" 1.2950 + "jg 1b \n" 1.2951 + : "+r"(y_buf), // %0 1.2952 + "+r"(dst_argb), // %1 1.2953 + "+rm"(width) // %2 1.2954 + : 1.2955 + : "memory", "cc", "eax" 1.2956 +#if defined(__SSE2__) 1.2957 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" 1.2958 +#endif 1.2959 + ); 1.2960 +} 1.2961 +#endif // HAS_YTOARGBROW_SSE2 1.2962 + 1.2963 +#ifdef HAS_MIRRORROW_SSSE3 1.2964 +// Shuffle table for reversing the bytes. 
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};

// Reverse a row of bytes, 16 per iteration, using pshufb with the table
// above. Uses aligned loads/stores (movdqa), so src and dst must be
// 16-byte aligned; assumes width is a multiple of 16 (no tail loop).
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    // Bias src back one vector so (src + remaining_width) reads the last
    // untouched 16 bytes each iteration as the count shrinks.
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
    MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif // HAS_MIRRORROW_SSSE3

#ifdef HAS_MIRRORROW_SSE2
// Reverse a row of bytes without SSSE3: swap adjacent bytes with word
// shifts, then reverse words and qwords with pshuflw/pshufhw/pshufd.
// Unaligned loads/stores (movdqu); assumes width is a multiple of 16.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
    "movdqa    %%xmm0,%%xmm1                   \n"
    "psllw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // bytes swapped within each word
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"  // 0x1b = reverse 4 words
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"  // 0x4e = swap qword halves
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1)",%1            \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif // HAS_MIRRORROW_SSE2

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Mirror an interleaved UV row while deinterleaving it: reversed U bytes
// go to dst_u, reversed V bytes to dst_v, 8 pairs per iteration.
// Aligned load (movdqa); assumes width (in pairs) is a multiple of 8.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    // Start at the last 16 bytes of src (src + width*2 - 16), walking down.
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u) so one pointer advances.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "pshufb    %%xmm1,%%xmm0                   \n"  // low 8 = mirrored U, high 8 = mirrored V
    "sub       $8,%3                           \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd  %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif // HAS_MIRRORROW_UV_SSSE3

#ifdef HAS_ARGBMIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};

// Reverse a row of ARGB pixels (4 bytes each), 4 pixels per iteration.
// Aligned loads/stores (movdqa); assumes width is a multiple of 4.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Start at the last 16 bytes of src (src + width*4 - 16), walking down.
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
    "movdqa    %3,%%xmm5                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm5,%%xmm0                   \n"  // reverse pixel order, keep bytes within a pixel
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
#endif // HAS_ARGBMIRRORROW_SSSE3

#ifdef HAS_SPLITUVROW_SSE2
// Deinterleave UVUV... into separate U and V rows, 16 pairs (32 bytes of
// input) per iteration. Aligned loads/stores; assumes pix is a multiple
// of 16.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff mask: keep even (U) bytes
    "sub       %1,%2                           \n"  // dst_v addressed relative to dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"  // 16 U bytes
    "psrlw     $0x8,%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "packuswb  %%xmm3,%%xmm2                   \n"  // 16 V bytes
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)           //  movdqa  %%xmm2,(%1,%2)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Same as SplitUVRow_SSE2 but with unaligned loads/stores (movdqu).
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm1,%%xmm3                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu  %%xmm2,(%1,%2)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_u),   // %1
    "+r"(dst_v),   // %2
    "+r"(pix)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
#endif // HAS_SPLITUVROW_SSE2

#ifdef HAS_MERGEUVROW_SSE2
// Interleave separate U and V rows into UVUV..., 16 pairs per iteration.
// Aligned loads/stores; assumes width is a multiple of 16.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // src_v addressed relative to src_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa  (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

// Same as MergeUVRow_SSE2 but with unaligned loads/stores (movdqu).
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu  (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm2                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_u),   // %0
    "+r"(src_v),   // %1
    "+r"(dst_uv),  // %2
    "+r"(width)    // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif // HAS_MERGEUVROW_SSE2

#ifdef HAS_COPYROW_SSE2
// Copy a row, 32 bytes per iteration. Aligned loads/stores (movdqa);
// assumes count is a multiple of 32.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
#endif // HAS_COPYROW_SSE2

#ifdef HAS_COPYROW_X86
// Copy a row with rep movsl (4 bytes at a time). The shr drops the low
// two bits, so width is assumed to be a multiple of 4.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr       $0x2,%2                         \n"
    "rep movsl " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp)  // %2
  :
  : "memory", "cc"
  );
}
#endif // HAS_COPYROW_X86

#ifdef HAS_COPYROW_ERMS
// Unaligned Multiple of 1.
// Byte copy via rep movsb; fast on CPUs with enhanced rep move/store.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp)  // %2
  :
  : "memory", "cc"
  );
}
#endif // HAS_COPYROW_ERMS

#ifdef HAS_ARGBCOPYALPHAROW_SSE2
// width in pixels
// Copy only the alpha channel of src ARGB pixels into dst, preserving
// dst's RGB. 8 pixels per iteration, aligned loads/stores.
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"  // 0xff000000 alpha mask
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"  // 0x00ffffff RGB mask
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBCOPYALPHAROW_SSE2

#ifdef HAS_ARGBCOPYALPHAROW_AVX2
// width in pixels
// AVX2 variant: blend src into dst under a 0x00ffffff-per-pixel mask so
// only the alpha byte of each dst pixel is replaced. 16 pixels/iteration.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"  // blend mask: keep dst RGB, take src alpha
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif // HAS_ARGBCOPYALPHAROW_AVX2

#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
// width in pixels
// Expand 8 Y bytes to the alpha channel of 8 dst ARGB pixels, keeping
// dst's RGB. Aligned dst loads/stores (movdqa).
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"  // 0xff000000 alpha mask
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"  // 0x00ffffff RGB mask
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    // xmm3's other lanes hold stale data here; the pand with the alpha
    // mask below discards everything except the replicated Y bytes.
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif // HAS_ARGBCOPYYTOALPHAROW_SSE2

#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
// width in pixels
// AVX2 variant: zero-extend Y bytes to dwords, shift into the alpha
// position, then blend into dst. 16 pixels per iteration.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
#endif // HAS_ARGBCOPYYTOALPHAROW_AVX2

#ifdef HAS_SETROW_X86
// Fill a row with the 32-bit value v32 via rep stosl. The shr drops the
// low two bits, so width is assumed to be a multiple of 4 bytes.
void SetRow_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr       $0x2,%1                         \n"
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
  : "+D"(dst),  // %0
    "+c"(width_tmp)  // %1
  : "a"(v32)    // %2
  : "memory", "cc");
}

// Fill `height` rows of `width` ARGB pixels with v32; rows are
// dst_stride bytes apart.
void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
                     int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    size_t width_tmp = (size_t)(width);
    uint32* d = (uint32*)(dst);
    asm volatile (
      "rep stosl " MEMSTORESTRING(eax,0) "     \n"
    : "+D"(d),  // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)  // %2
    : "memory", "cc");
    dst += dst_stride;
  }
}
#endif // HAS_SETROW_X86

#ifdef HAS_YUY2TOYROW_SSE2
// Extract the Y (even) bytes of a YUY2 row, 16 pixels per iteration.
// Aligned loads/stores; assumes pix is a multiple of 16.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff mask keeps even (Y) bytes
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Extract U and V from two YUY2 rows (this row and the next, averaged
// vertically with pavgb) into separate planes. 16 pixels per iteration.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"  // dst_v addressed relative to dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"  // keep odd (UV) bytes
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // U = even of the UV stream
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"  // V = odd of the UV stream
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Extract U and V from a single YUY2 row (no vertical averaging) into
// separate planes. 16 pixels per iteration, aligned loads.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " 
MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Unaligned (movdqu) variant of YUY2ToYRow_SSE2.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"  // 0x00ff mask keeps even (Y) bytes
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Unaligned (movdqu) variant of YUY2ToUVRow_SSE2: averages two rows
// vertically, then splits U and V into separate planes.
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Unaligned (movdqu) variant of YUY2ToUV422Row_SSE2 (single row, no
// vertical averaging).
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Extract the Y (odd) bytes of a UYVY row, 16 pixels per iteration.
// Aligned loads/stores; assumes pix is a multiple of 16.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"  // Y bytes are the odd bytes in UYVY
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Extract U and V from two UYVY rows (vertically averaged with pavgb)
// into separate planes. 16 pixels per iteration, aligned loads.
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"  // UV bytes are the even bytes in UYVY
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Extract U and V from a single UYVY row (no vertical averaging) into
// separate planes. 16 pixels per iteration, aligned loads.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

// Unaligned (movdqu) variant of UYVYToYRow_SSE2.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Unaligned (movdqu) variant of UYVYToUVRow_SSE2.
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}

// Unaligned (movdqu) variant of UYVYToUV422Row_SSE2.
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_u),     // %1
    "+r"(dst_v),     // %2
    "+r"(pix)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // HAS_YUY2TOYROW_SSE2

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
1.3939 + "10: \n" 1.3940 + "test $0xf,%2 \n" 1.3941 + "je 19f \n" 1.3942 + "movd " MEMACCESS(0) ",%%xmm3 \n" 1.3943 + "lea " MEMLEA(0x4,0) ",%0 \n" 1.3944 + "movdqa %%xmm3,%%xmm0 \n" 1.3945 + "pxor %%xmm4,%%xmm3 \n" 1.3946 + "movd " MEMACCESS(1) ",%%xmm2 \n" 1.3947 + "psrlw $0x8,%%xmm3 \n" 1.3948 + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" 1.3949 + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" 1.3950 + "pand %%xmm6,%%xmm2 \n" 1.3951 + "paddw %%xmm7,%%xmm3 \n" 1.3952 + "pmullw %%xmm3,%%xmm2 \n" 1.3953 + "movd " MEMACCESS(1) ",%%xmm1 \n" 1.3954 + "lea " MEMLEA(0x4,1) ",%1 \n" 1.3955 + "psrlw $0x8,%%xmm1 \n" 1.3956 + "por %%xmm4,%%xmm0 \n" 1.3957 + "pmullw %%xmm3,%%xmm1 \n" 1.3958 + "psrlw $0x8,%%xmm2 \n" 1.3959 + "paddusb %%xmm2,%%xmm0 \n" 1.3960 + "pand %%xmm5,%%xmm1 \n" 1.3961 + "paddusb %%xmm1,%%xmm0 \n" 1.3962 + "sub $0x1,%3 \n" 1.3963 + "movd %%xmm0," MEMACCESS(2) " \n" 1.3964 + "lea " MEMLEA(0x4,2) ",%2 \n" 1.3965 + "jge 10b \n" 1.3966 + 1.3967 + "19: \n" 1.3968 + "add $1-4,%3 \n" 1.3969 + "jl 49f \n" 1.3970 + 1.3971 + // 4 pixel loop. 
1.3972 + LABELALIGN 1.3973 + "41: \n" 1.3974 + "movdqu " MEMACCESS(0) ",%%xmm3 \n" 1.3975 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.3976 + "movdqa %%xmm3,%%xmm0 \n" 1.3977 + "pxor %%xmm4,%%xmm3 \n" 1.3978 + "movdqu " MEMACCESS(1) ",%%xmm2 \n" 1.3979 + "psrlw $0x8,%%xmm3 \n" 1.3980 + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" 1.3981 + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" 1.3982 + "pand %%xmm6,%%xmm2 \n" 1.3983 + "paddw %%xmm7,%%xmm3 \n" 1.3984 + "pmullw %%xmm3,%%xmm2 \n" 1.3985 + "movdqu " MEMACCESS(1) ",%%xmm1 \n" 1.3986 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.3987 + "psrlw $0x8,%%xmm1 \n" 1.3988 + "por %%xmm4,%%xmm0 \n" 1.3989 + "pmullw %%xmm3,%%xmm1 \n" 1.3990 + "psrlw $0x8,%%xmm2 \n" 1.3991 + "paddusb %%xmm2,%%xmm0 \n" 1.3992 + "pand %%xmm5,%%xmm1 \n" 1.3993 + "paddusb %%xmm1,%%xmm0 \n" 1.3994 + "sub $0x4,%3 \n" 1.3995 + "movdqa %%xmm0," MEMACCESS(2) " \n" 1.3996 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.3997 + "jge 41b \n" 1.3998 + 1.3999 + "49: \n" 1.4000 + "add $0x3,%3 \n" 1.4001 + "jl 99f \n" 1.4002 + 1.4003 + // 1 pixel loop. 
1.4004 + "91: \n" 1.4005 + "movd " MEMACCESS(0) ",%%xmm3 \n" 1.4006 + "lea " MEMLEA(0x4,0) ",%0 \n" 1.4007 + "movdqa %%xmm3,%%xmm0 \n" 1.4008 + "pxor %%xmm4,%%xmm3 \n" 1.4009 + "movd " MEMACCESS(1) ",%%xmm2 \n" 1.4010 + "psrlw $0x8,%%xmm3 \n" 1.4011 + "pshufhw $0xf5,%%xmm3,%%xmm3 \n" 1.4012 + "pshuflw $0xf5,%%xmm3,%%xmm3 \n" 1.4013 + "pand %%xmm6,%%xmm2 \n" 1.4014 + "paddw %%xmm7,%%xmm3 \n" 1.4015 + "pmullw %%xmm3,%%xmm2 \n" 1.4016 + "movd " MEMACCESS(1) ",%%xmm1 \n" 1.4017 + "lea " MEMLEA(0x4,1) ",%1 \n" 1.4018 + "psrlw $0x8,%%xmm1 \n" 1.4019 + "por %%xmm4,%%xmm0 \n" 1.4020 + "pmullw %%xmm3,%%xmm1 \n" 1.4021 + "psrlw $0x8,%%xmm2 \n" 1.4022 + "paddusb %%xmm2,%%xmm0 \n" 1.4023 + "pand %%xmm5,%%xmm1 \n" 1.4024 + "paddusb %%xmm1,%%xmm0 \n" 1.4025 + "sub $0x1,%3 \n" 1.4026 + "movd %%xmm0," MEMACCESS(2) " \n" 1.4027 + "lea " MEMLEA(0x4,2) ",%2 \n" 1.4028 + "jge 91b \n" 1.4029 + "99: \n" 1.4030 + : "+r"(src_argb0), // %0 1.4031 + "+r"(src_argb1), // %1 1.4032 + "+r"(dst_argb), // %2 1.4033 + "+r"(width) // %3 1.4034 + : 1.4035 + : "memory", "cc" 1.4036 +#if defined(__SSE2__) 1.4037 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.4038 +#endif 1.4039 + ); 1.4040 +} 1.4041 +#endif // HAS_ARGBBLENDROW_SSE2 1.4042 + 1.4043 +#ifdef HAS_ARGBBLENDROW_SSSE3 1.4044 +// Shuffle table for isolating alpha. 1.4045 +static uvec8 kShuffleAlpha = { 1.4046 + 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 1.4047 + 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 1.4048 +}; 1.4049 + 1.4050 +// Blend 8 pixels at a time 1.4051 +// Shuffle table for reversing the bytes. 1.4052 + 1.4053 +// Same as SSE2, but replaces 1.4054 +// psrlw xmm3, 8 // alpha 1.4055 +// pshufhw xmm3, xmm3,0F5h // 8 alpha words 1.4056 +// pshuflw xmm3, xmm3,0F5h 1.4057 +// with.. 
1.4058 +// pshufb xmm3, kShuffleAlpha // alpha 1.4059 + 1.4060 +void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 1.4061 + uint8* dst_argb, int width) { 1.4062 + asm volatile ( 1.4063 + "pcmpeqb %%xmm7,%%xmm7 \n" 1.4064 + "psrlw $0xf,%%xmm7 \n" 1.4065 + "pcmpeqb %%xmm6,%%xmm6 \n" 1.4066 + "psrlw $0x8,%%xmm6 \n" 1.4067 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.4068 + "psllw $0x8,%%xmm5 \n" 1.4069 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.4070 + "pslld $0x18,%%xmm4 \n" 1.4071 + "sub $0x1,%3 \n" 1.4072 + "je 91f \n" 1.4073 + "jl 99f \n" 1.4074 + 1.4075 + // 1 pixel loop until destination pointer is aligned. 1.4076 + "10: \n" 1.4077 + "test $0xf,%2 \n" 1.4078 + "je 19f \n" 1.4079 + "movd " MEMACCESS(0) ",%%xmm3 \n" 1.4080 + "lea " MEMLEA(0x4,0) ",%0 \n" 1.4081 + "movdqa %%xmm3,%%xmm0 \n" 1.4082 + "pxor %%xmm4,%%xmm3 \n" 1.4083 + "movd " MEMACCESS(1) ",%%xmm2 \n" 1.4084 + "pshufb %4,%%xmm3 \n" 1.4085 + "pand %%xmm6,%%xmm2 \n" 1.4086 + "paddw %%xmm7,%%xmm3 \n" 1.4087 + "pmullw %%xmm3,%%xmm2 \n" 1.4088 + "movd " MEMACCESS(1) ",%%xmm1 \n" 1.4089 + "lea " MEMLEA(0x4,1) ",%1 \n" 1.4090 + "psrlw $0x8,%%xmm1 \n" 1.4091 + "por %%xmm4,%%xmm0 \n" 1.4092 + "pmullw %%xmm3,%%xmm1 \n" 1.4093 + "psrlw $0x8,%%xmm2 \n" 1.4094 + "paddusb %%xmm2,%%xmm0 \n" 1.4095 + "pand %%xmm5,%%xmm1 \n" 1.4096 + "paddusb %%xmm1,%%xmm0 \n" 1.4097 + "sub $0x1,%3 \n" 1.4098 + "movd %%xmm0," MEMACCESS(2) " \n" 1.4099 + "lea " MEMLEA(0x4,2) ",%2 \n" 1.4100 + "jge 10b \n" 1.4101 + 1.4102 + "19: \n" 1.4103 + "add $1-4,%3 \n" 1.4104 + "jl 49f \n" 1.4105 + "test $0xf,%0 \n" 1.4106 + "jne 41f \n" 1.4107 + "test $0xf,%1 \n" 1.4108 + "jne 41f \n" 1.4109 + 1.4110 + // 4 pixel loop. 
1.4111 + LABELALIGN 1.4112 + "40: \n" 1.4113 + "movdqa " MEMACCESS(0) ",%%xmm3 \n" 1.4114 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4115 + "movdqa %%xmm3,%%xmm0 \n" 1.4116 + "pxor %%xmm4,%%xmm3 \n" 1.4117 + "movdqa " MEMACCESS(1) ",%%xmm2 \n" 1.4118 + "pshufb %4,%%xmm3 \n" 1.4119 + "pand %%xmm6,%%xmm2 \n" 1.4120 + "paddw %%xmm7,%%xmm3 \n" 1.4121 + "pmullw %%xmm3,%%xmm2 \n" 1.4122 + "movdqa " MEMACCESS(1) ",%%xmm1 \n" 1.4123 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4124 + "psrlw $0x8,%%xmm1 \n" 1.4125 + "por %%xmm4,%%xmm0 \n" 1.4126 + "pmullw %%xmm3,%%xmm1 \n" 1.4127 + "psrlw $0x8,%%xmm2 \n" 1.4128 + "paddusb %%xmm2,%%xmm0 \n" 1.4129 + "pand %%xmm5,%%xmm1 \n" 1.4130 + "paddusb %%xmm1,%%xmm0 \n" 1.4131 + "sub $0x4,%3 \n" 1.4132 + "movdqa %%xmm0," MEMACCESS(2) " \n" 1.4133 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.4134 + "jge 40b \n" 1.4135 + "jmp 49f \n" 1.4136 + 1.4137 + // 4 pixel unaligned loop. 1.4138 + LABELALIGN 1.4139 + "41: \n" 1.4140 + "movdqu " MEMACCESS(0) ",%%xmm3 \n" 1.4141 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4142 + "movdqa %%xmm3,%%xmm0 \n" 1.4143 + "pxor %%xmm4,%%xmm3 \n" 1.4144 + "movdqu " MEMACCESS(1) ",%%xmm2 \n" 1.4145 + "pshufb %4,%%xmm3 \n" 1.4146 + "pand %%xmm6,%%xmm2 \n" 1.4147 + "paddw %%xmm7,%%xmm3 \n" 1.4148 + "pmullw %%xmm3,%%xmm2 \n" 1.4149 + "movdqu " MEMACCESS(1) ",%%xmm1 \n" 1.4150 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4151 + "psrlw $0x8,%%xmm1 \n" 1.4152 + "por %%xmm4,%%xmm0 \n" 1.4153 + "pmullw %%xmm3,%%xmm1 \n" 1.4154 + "psrlw $0x8,%%xmm2 \n" 1.4155 + "paddusb %%xmm2,%%xmm0 \n" 1.4156 + "pand %%xmm5,%%xmm1 \n" 1.4157 + "paddusb %%xmm1,%%xmm0 \n" 1.4158 + "sub $0x4,%3 \n" 1.4159 + "movdqa %%xmm0," MEMACCESS(2) " \n" 1.4160 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.4161 + "jge 41b \n" 1.4162 + 1.4163 + "49: \n" 1.4164 + "add $0x3,%3 \n" 1.4165 + "jl 99f \n" 1.4166 + 1.4167 + // 1 pixel loop. 
1.4168 + "91: \n" 1.4169 + "movd " MEMACCESS(0) ",%%xmm3 \n" 1.4170 + "lea " MEMLEA(0x4,0) ",%0 \n" 1.4171 + "movdqa %%xmm3,%%xmm0 \n" 1.4172 + "pxor %%xmm4,%%xmm3 \n" 1.4173 + "movd " MEMACCESS(1) ",%%xmm2 \n" 1.4174 + "pshufb %4,%%xmm3 \n" 1.4175 + "pand %%xmm6,%%xmm2 \n" 1.4176 + "paddw %%xmm7,%%xmm3 \n" 1.4177 + "pmullw %%xmm3,%%xmm2 \n" 1.4178 + "movd " MEMACCESS(1) ",%%xmm1 \n" 1.4179 + "lea " MEMLEA(0x4,1) ",%1 \n" 1.4180 + "psrlw $0x8,%%xmm1 \n" 1.4181 + "por %%xmm4,%%xmm0 \n" 1.4182 + "pmullw %%xmm3,%%xmm1 \n" 1.4183 + "psrlw $0x8,%%xmm2 \n" 1.4184 + "paddusb %%xmm2,%%xmm0 \n" 1.4185 + "pand %%xmm5,%%xmm1 \n" 1.4186 + "paddusb %%xmm1,%%xmm0 \n" 1.4187 + "sub $0x1,%3 \n" 1.4188 + "movd %%xmm0," MEMACCESS(2) " \n" 1.4189 + "lea " MEMLEA(0x4,2) ",%2 \n" 1.4190 + "jge 91b \n" 1.4191 + "99: \n" 1.4192 + : "+r"(src_argb0), // %0 1.4193 + "+r"(src_argb1), // %1 1.4194 + "+r"(dst_argb), // %2 1.4195 + "+r"(width) // %3 1.4196 + : "m"(kShuffleAlpha) // %4 1.4197 + : "memory", "cc" 1.4198 +#if defined(__SSE2__) 1.4199 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.4200 +#endif 1.4201 + ); 1.4202 +} 1.4203 +#endif // HAS_ARGBBLENDROW_SSSE3 1.4204 + 1.4205 +#ifdef HAS_ARGBATTENUATEROW_SSE2 1.4206 +// Attenuate 4 pixels at a time. 1.4207 +// aligned to 16 bytes 1.4208 +void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 1.4209 + asm volatile ( 1.4210 + "pcmpeqb %%xmm4,%%xmm4 \n" 1.4211 + "pslld $0x18,%%xmm4 \n" 1.4212 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.4213 + "psrld $0x8,%%xmm5 \n" 1.4214 + 1.4215 + // 4 pixel loop. 
1.4216 + LABELALIGN 1.4217 + "1: \n" 1.4218 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4219 + "punpcklbw %%xmm0,%%xmm0 \n" 1.4220 + "pshufhw $0xff,%%xmm0,%%xmm2 \n" 1.4221 + "pshuflw $0xff,%%xmm2,%%xmm2 \n" 1.4222 + "pmulhuw %%xmm2,%%xmm0 \n" 1.4223 + "movdqa " MEMACCESS(0) ",%%xmm1 \n" 1.4224 + "punpckhbw %%xmm1,%%xmm1 \n" 1.4225 + "pshufhw $0xff,%%xmm1,%%xmm2 \n" 1.4226 + "pshuflw $0xff,%%xmm2,%%xmm2 \n" 1.4227 + "pmulhuw %%xmm2,%%xmm1 \n" 1.4228 + "movdqa " MEMACCESS(0) ",%%xmm2 \n" 1.4229 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4230 + "psrlw $0x8,%%xmm0 \n" 1.4231 + "pand %%xmm4,%%xmm2 \n" 1.4232 + "psrlw $0x8,%%xmm1 \n" 1.4233 + "packuswb %%xmm1,%%xmm0 \n" 1.4234 + "pand %%xmm5,%%xmm0 \n" 1.4235 + "por %%xmm2,%%xmm0 \n" 1.4236 + "sub $0x4,%2 \n" 1.4237 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.4238 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4239 + "jg 1b \n" 1.4240 + : "+r"(src_argb), // %0 1.4241 + "+r"(dst_argb), // %1 1.4242 + "+r"(width) // %2 1.4243 + : 1.4244 + : "memory", "cc" 1.4245 +#if defined(__SSE2__) 1.4246 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.4247 +#endif 1.4248 + ); 1.4249 +} 1.4250 +#endif // HAS_ARGBATTENUATEROW_SSE2 1.4251 + 1.4252 +#ifdef HAS_ARGBATTENUATEROW_SSSE3 1.4253 +// Shuffle table duplicating alpha 1.4254 +static uvec8 kShuffleAlpha0 = { 1.4255 + 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 1.4256 +}; 1.4257 +static uvec8 kShuffleAlpha1 = { 1.4258 + 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 1.4259 + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 1.4260 +}; 1.4261 +// Attenuate 4 pixels at a time. 1.4262 +// aligned to 16 bytes 1.4263 +void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 1.4264 + asm volatile ( 1.4265 + "pcmpeqb %%xmm3,%%xmm3 \n" 1.4266 + "pslld $0x18,%%xmm3 \n" 1.4267 + "movdqa %3,%%xmm4 \n" 1.4268 + "movdqa %4,%%xmm5 \n" 1.4269 + 1.4270 + // 4 pixel loop. 
1.4271 + LABELALIGN 1.4272 + "1: \n" 1.4273 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.4274 + "pshufb %%xmm4,%%xmm0 \n" 1.4275 + "movdqu " MEMACCESS(0) ",%%xmm1 \n" 1.4276 + "punpcklbw %%xmm1,%%xmm1 \n" 1.4277 + "pmulhuw %%xmm1,%%xmm0 \n" 1.4278 + "movdqu " MEMACCESS(0) ",%%xmm1 \n" 1.4279 + "pshufb %%xmm5,%%xmm1 \n" 1.4280 + "movdqu " MEMACCESS(0) ",%%xmm2 \n" 1.4281 + "punpckhbw %%xmm2,%%xmm2 \n" 1.4282 + "pmulhuw %%xmm2,%%xmm1 \n" 1.4283 + "movdqu " MEMACCESS(0) ",%%xmm2 \n" 1.4284 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4285 + "pand %%xmm3,%%xmm2 \n" 1.4286 + "psrlw $0x8,%%xmm0 \n" 1.4287 + "psrlw $0x8,%%xmm1 \n" 1.4288 + "packuswb %%xmm1,%%xmm0 \n" 1.4289 + "por %%xmm2,%%xmm0 \n" 1.4290 + "sub $0x4,%2 \n" 1.4291 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.4292 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4293 + "jg 1b \n" 1.4294 + : "+r"(src_argb), // %0 1.4295 + "+r"(dst_argb), // %1 1.4296 + "+r"(width) // %2 1.4297 + : "m"(kShuffleAlpha0), // %3 1.4298 + "m"(kShuffleAlpha1) // %4 1.4299 + : "memory", "cc" 1.4300 +#if defined(__SSE2__) 1.4301 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.4302 +#endif 1.4303 + ); 1.4304 +} 1.4305 +#endif // HAS_ARGBATTENUATEROW_SSSE3 1.4306 + 1.4307 +#ifdef HAS_ARGBUNATTENUATEROW_SSE2 1.4308 +// Unattenuate 4 pixels at a time. 1.4309 +// aligned to 16 bytes 1.4310 +void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 1.4311 + int width) { 1.4312 + uintptr_t alpha = 0; 1.4313 + asm volatile ( 1.4314 + // 4 pixel loop. 
1.4315 + LABELALIGN 1.4316 + "1: \n" 1.4317 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.4318 + "movzb " MEMACCESS2(0x03,0) ",%3 \n" 1.4319 + "punpcklbw %%xmm0,%%xmm0 \n" 1.4320 + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 1.4321 + "movzb " MEMACCESS2(0x07,0) ",%3 \n" 1.4322 + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 1.4323 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" 1.4324 + "pshuflw $0x40,%%xmm3,%%xmm3 \n" 1.4325 + "movlhps %%xmm3,%%xmm2 \n" 1.4326 + "pmulhuw %%xmm2,%%xmm0 \n" 1.4327 + "movdqu " MEMACCESS(0) ",%%xmm1 \n" 1.4328 + "movzb " MEMACCESS2(0x0b,0) ",%3 \n" 1.4329 + "punpckhbw %%xmm1,%%xmm1 \n" 1.4330 + BUNDLEALIGN 1.4331 + MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 1.4332 + "movzb " MEMACCESS2(0x0f,0) ",%3 \n" 1.4333 + MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 1.4334 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" 1.4335 + "pshuflw $0x40,%%xmm3,%%xmm3 \n" 1.4336 + "movlhps %%xmm3,%%xmm2 \n" 1.4337 + "pmulhuw %%xmm2,%%xmm1 \n" 1.4338 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4339 + "packuswb %%xmm1,%%xmm0 \n" 1.4340 + "sub $0x4,%2 \n" 1.4341 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.4342 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4343 + "jg 1b \n" 1.4344 + : "+r"(src_argb), // %0 1.4345 + "+r"(dst_argb), // %1 1.4346 + "+r"(width), // %2 1.4347 + "+r"(alpha) // %3 1.4348 + : "r"(fixed_invtbl8) // %4 1.4349 + : "memory", "cc" 1.4350 +#if defined(__native_client__) && defined(__x86_64__) 1.4351 + , "r14" 1.4352 +#endif 1.4353 +#if defined(__SSE2__) 1.4354 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.4355 +#endif 1.4356 + ); 1.4357 +} 1.4358 +#endif // HAS_ARGBUNATTENUATEROW_SSE2 1.4359 + 1.4360 +#ifdef HAS_ARGBGRAYROW_SSSE3 1.4361 +// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels 1.4362 +void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 1.4363 + asm volatile ( 1.4364 + "movdqa %3,%%xmm4 \n" 1.4365 + "movdqa %4,%%xmm5 \n" 1.4366 + 1.4367 + // 8 pixel loop. 
1.4368 + LABELALIGN 1.4369 + "1: \n" 1.4370 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4371 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.4372 + "pmaddubsw %%xmm4,%%xmm0 \n" 1.4373 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.4374 + "phaddw %%xmm1,%%xmm0 \n" 1.4375 + "paddw %%xmm5,%%xmm0 \n" 1.4376 + "psrlw $0x7,%%xmm0 \n" 1.4377 + "packuswb %%xmm0,%%xmm0 \n" 1.4378 + "movdqa " MEMACCESS(0) ",%%xmm2 \n" 1.4379 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" 1.4380 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.4381 + "psrld $0x18,%%xmm2 \n" 1.4382 + "psrld $0x18,%%xmm3 \n" 1.4383 + "packuswb %%xmm3,%%xmm2 \n" 1.4384 + "packuswb %%xmm2,%%xmm2 \n" 1.4385 + "movdqa %%xmm0,%%xmm3 \n" 1.4386 + "punpcklbw %%xmm0,%%xmm0 \n" 1.4387 + "punpcklbw %%xmm2,%%xmm3 \n" 1.4388 + "movdqa %%xmm0,%%xmm1 \n" 1.4389 + "punpcklwd %%xmm3,%%xmm0 \n" 1.4390 + "punpckhwd %%xmm3,%%xmm1 \n" 1.4391 + "sub $0x8,%2 \n" 1.4392 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.4393 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.4394 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.4395 + "jg 1b \n" 1.4396 + : "+r"(src_argb), // %0 1.4397 + "+r"(dst_argb), // %1 1.4398 + "+r"(width) // %2 1.4399 + : "m"(kARGBToYJ), // %3 1.4400 + "m"(kAddYJ64) // %4 1.4401 + : "memory", "cc" 1.4402 +#if defined(__SSE2__) 1.4403 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.4404 +#endif 1.4405 + ); 1.4406 +} 1.4407 +#endif // HAS_ARGBGRAYROW_SSSE3 1.4408 + 1.4409 +#ifdef HAS_ARGBSEPIAROW_SSSE3 1.4410 +// b = (r * 35 + g * 68 + b * 17) >> 7 1.4411 +// g = (r * 45 + g * 88 + b * 22) >> 7 1.4412 +// r = (r * 50 + g * 98 + b * 24) >> 7 1.4413 +// Constant for ARGB color to sepia tone 1.4414 +static vec8 kARGBToSepiaB = { 1.4415 + 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 1.4416 +}; 1.4417 + 1.4418 +static vec8 kARGBToSepiaG = { 1.4419 + 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 1.4420 +}; 1.4421 + 1.4422 +static vec8 kARGBToSepiaR = { 1.4423 + 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 1.4424 +}; 1.4425 + 1.4426 +// 
Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 1.4427 +void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 1.4428 + asm volatile ( 1.4429 + "movdqa %2,%%xmm2 \n" 1.4430 + "movdqa %3,%%xmm3 \n" 1.4431 + "movdqa %4,%%xmm4 \n" 1.4432 + 1.4433 + // 8 pixel loop. 1.4434 + LABELALIGN 1.4435 + "1: \n" 1.4436 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4437 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" 1.4438 + "pmaddubsw %%xmm2,%%xmm0 \n" 1.4439 + "pmaddubsw %%xmm2,%%xmm6 \n" 1.4440 + "phaddw %%xmm6,%%xmm0 \n" 1.4441 + "psrlw $0x7,%%xmm0 \n" 1.4442 + "packuswb %%xmm0,%%xmm0 \n" 1.4443 + "movdqa " MEMACCESS(0) ",%%xmm5 \n" 1.4444 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.4445 + "pmaddubsw %%xmm3,%%xmm5 \n" 1.4446 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.4447 + "phaddw %%xmm1,%%xmm5 \n" 1.4448 + "psrlw $0x7,%%xmm5 \n" 1.4449 + "packuswb %%xmm5,%%xmm5 \n" 1.4450 + "punpcklbw %%xmm5,%%xmm0 \n" 1.4451 + "movdqa " MEMACCESS(0) ",%%xmm5 \n" 1.4452 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.4453 + "pmaddubsw %%xmm4,%%xmm5 \n" 1.4454 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.4455 + "phaddw %%xmm1,%%xmm5 \n" 1.4456 + "psrlw $0x7,%%xmm5 \n" 1.4457 + "packuswb %%xmm5,%%xmm5 \n" 1.4458 + "movdqa " MEMACCESS(0) ",%%xmm6 \n" 1.4459 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.4460 + "psrld $0x18,%%xmm6 \n" 1.4461 + "psrld $0x18,%%xmm1 \n" 1.4462 + "packuswb %%xmm1,%%xmm6 \n" 1.4463 + "packuswb %%xmm6,%%xmm6 \n" 1.4464 + "punpcklbw %%xmm6,%%xmm5 \n" 1.4465 + "movdqa %%xmm0,%%xmm1 \n" 1.4466 + "punpcklwd %%xmm5,%%xmm0 \n" 1.4467 + "punpckhwd %%xmm5,%%xmm1 \n" 1.4468 + "sub $0x8,%1 \n" 1.4469 + "movdqa %%xmm0," MEMACCESS(0) " \n" 1.4470 + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" 1.4471 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.4472 + "jg 1b \n" 1.4473 + : "+r"(dst_argb), // %0 1.4474 + "+r"(width) // %1 1.4475 + : "m"(kARGBToSepiaB), // %2 1.4476 + "m"(kARGBToSepiaG), // %3 1.4477 + "m"(kARGBToSepiaR) // %4 1.4478 + : "memory", "cc" 1.4479 +#if defined(__SSE2__) 1.4480 + , "xmm0", "xmm1", "xmm2", 
"xmm3", "xmm4", "xmm5", "xmm6" 1.4481 +#endif 1.4482 + ); 1.4483 +} 1.4484 +#endif // HAS_ARGBSEPIAROW_SSSE3 1.4485 + 1.4486 +#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 1.4487 +// Tranform 8 ARGB pixels (32 bytes) with color matrix. 1.4488 +// Same as Sepia except matrix is provided. 1.4489 +void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 1.4490 + const int8* matrix_argb, int width) { 1.4491 + asm volatile ( 1.4492 + "movdqu " MEMACCESS(3) ",%%xmm5 \n" 1.4493 + "pshufd $0x00,%%xmm5,%%xmm2 \n" 1.4494 + "pshufd $0x55,%%xmm5,%%xmm3 \n" 1.4495 + "pshufd $0xaa,%%xmm5,%%xmm4 \n" 1.4496 + "pshufd $0xff,%%xmm5,%%xmm5 \n" 1.4497 + 1.4498 + // 8 pixel loop. 1.4499 + LABELALIGN 1.4500 + "1: \n" 1.4501 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4502 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" 1.4503 + "pmaddubsw %%xmm2,%%xmm0 \n" 1.4504 + "pmaddubsw %%xmm2,%%xmm7 \n" 1.4505 + "movdqa " MEMACCESS(0) ",%%xmm6 \n" 1.4506 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.4507 + "pmaddubsw %%xmm3,%%xmm6 \n" 1.4508 + "pmaddubsw %%xmm3,%%xmm1 \n" 1.4509 + "phaddsw %%xmm7,%%xmm0 \n" 1.4510 + "phaddsw %%xmm1,%%xmm6 \n" 1.4511 + "psraw $0x6,%%xmm0 \n" 1.4512 + "psraw $0x6,%%xmm6 \n" 1.4513 + "packuswb %%xmm0,%%xmm0 \n" 1.4514 + "packuswb %%xmm6,%%xmm6 \n" 1.4515 + "punpcklbw %%xmm6,%%xmm0 \n" 1.4516 + "movdqa " MEMACCESS(0) ",%%xmm1 \n" 1.4517 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" 1.4518 + "pmaddubsw %%xmm4,%%xmm1 \n" 1.4519 + "pmaddubsw %%xmm4,%%xmm7 \n" 1.4520 + "phaddsw %%xmm7,%%xmm1 \n" 1.4521 + "movdqa " MEMACCESS(0) ",%%xmm6 \n" 1.4522 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" 1.4523 + "pmaddubsw %%xmm5,%%xmm6 \n" 1.4524 + "pmaddubsw %%xmm5,%%xmm7 \n" 1.4525 + "phaddsw %%xmm7,%%xmm6 \n" 1.4526 + "psraw $0x6,%%xmm1 \n" 1.4527 + "psraw $0x6,%%xmm6 \n" 1.4528 + "packuswb %%xmm1,%%xmm1 \n" 1.4529 + "packuswb %%xmm6,%%xmm6 \n" 1.4530 + "punpcklbw %%xmm6,%%xmm1 \n" 1.4531 + "movdqa %%xmm0,%%xmm6 \n" 1.4532 + "punpcklwd %%xmm1,%%xmm0 \n" 1.4533 + "punpckhwd %%xmm1,%%xmm6 
\n" 1.4534 + "sub $0x8,%2 \n" 1.4535 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.4536 + "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" 1.4537 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.4538 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.4539 + "jg 1b \n" 1.4540 + : "+r"(src_argb), // %0 1.4541 + "+r"(dst_argb), // %1 1.4542 + "+r"(width) // %2 1.4543 + : "r"(matrix_argb) // %3 1.4544 + : "memory", "cc" 1.4545 +#if defined(__SSE2__) 1.4546 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.4547 +#endif 1.4548 + ); 1.4549 +} 1.4550 +#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 1.4551 + 1.4552 +#ifdef HAS_ARGBQUANTIZEROW_SSE2 1.4553 +// Quantize 4 ARGB pixels (16 bytes). 1.4554 +// aligned to 16 bytes 1.4555 +void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 1.4556 + int interval_offset, int width) { 1.4557 + asm volatile ( 1.4558 + "movd %2,%%xmm2 \n" 1.4559 + "movd %3,%%xmm3 \n" 1.4560 + "movd %4,%%xmm4 \n" 1.4561 + "pshuflw $0x40,%%xmm2,%%xmm2 \n" 1.4562 + "pshufd $0x44,%%xmm2,%%xmm2 \n" 1.4563 + "pshuflw $0x40,%%xmm3,%%xmm3 \n" 1.4564 + "pshufd $0x44,%%xmm3,%%xmm3 \n" 1.4565 + "pshuflw $0x40,%%xmm4,%%xmm4 \n" 1.4566 + "pshufd $0x44,%%xmm4,%%xmm4 \n" 1.4567 + "pxor %%xmm5,%%xmm5 \n" 1.4568 + "pcmpeqb %%xmm6,%%xmm6 \n" 1.4569 + "pslld $0x18,%%xmm6 \n" 1.4570 + 1.4571 + // 4 pixel loop. 
1.4572 + LABELALIGN 1.4573 + "1: \n" 1.4574 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4575 + "punpcklbw %%xmm5,%%xmm0 \n" 1.4576 + "pmulhuw %%xmm2,%%xmm0 \n" 1.4577 + "movdqa " MEMACCESS(0) ",%%xmm1 \n" 1.4578 + "punpckhbw %%xmm5,%%xmm1 \n" 1.4579 + "pmulhuw %%xmm2,%%xmm1 \n" 1.4580 + "pmullw %%xmm3,%%xmm0 \n" 1.4581 + "movdqa " MEMACCESS(0) ",%%xmm7 \n" 1.4582 + "pmullw %%xmm3,%%xmm1 \n" 1.4583 + "pand %%xmm6,%%xmm7 \n" 1.4584 + "paddw %%xmm4,%%xmm0 \n" 1.4585 + "paddw %%xmm4,%%xmm1 \n" 1.4586 + "packuswb %%xmm1,%%xmm0 \n" 1.4587 + "por %%xmm7,%%xmm0 \n" 1.4588 + "sub $0x4,%1 \n" 1.4589 + "movdqa %%xmm0," MEMACCESS(0) " \n" 1.4590 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4591 + "jg 1b \n" 1.4592 + : "+r"(dst_argb), // %0 1.4593 + "+r"(width) // %1 1.4594 + : "r"(scale), // %2 1.4595 + "r"(interval_size), // %3 1.4596 + "r"(interval_offset) // %4 1.4597 + : "memory", "cc" 1.4598 +#if defined(__SSE2__) 1.4599 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.4600 +#endif 1.4601 + ); 1.4602 +} 1.4603 +#endif // HAS_ARGBQUANTIZEROW_SSE2 1.4604 + 1.4605 +#ifdef HAS_ARGBSHADEROW_SSE2 1.4606 +// Shade 4 pixels at a time by specified value. 1.4607 +// Aligned to 16 bytes. 1.4608 +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 1.4609 + uint32 value) { 1.4610 + asm volatile ( 1.4611 + "movd %3,%%xmm2 \n" 1.4612 + "punpcklbw %%xmm2,%%xmm2 \n" 1.4613 + "punpcklqdq %%xmm2,%%xmm2 \n" 1.4614 + 1.4615 + // 4 pixel loop. 
1.4616 + LABELALIGN 1.4617 + "1: \n" 1.4618 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4619 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4620 + "movdqa %%xmm0,%%xmm1 \n" 1.4621 + "punpcklbw %%xmm0,%%xmm0 \n" 1.4622 + "punpckhbw %%xmm1,%%xmm1 \n" 1.4623 + "pmulhuw %%xmm2,%%xmm0 \n" 1.4624 + "pmulhuw %%xmm2,%%xmm1 \n" 1.4625 + "psrlw $0x8,%%xmm0 \n" 1.4626 + "psrlw $0x8,%%xmm1 \n" 1.4627 + "packuswb %%xmm1,%%xmm0 \n" 1.4628 + "sub $0x4,%2 \n" 1.4629 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.4630 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4631 + "jg 1b \n" 1.4632 + : "+r"(src_argb), // %0 1.4633 + "+r"(dst_argb), // %1 1.4634 + "+r"(width) // %2 1.4635 + : "r"(value) // %3 1.4636 + : "memory", "cc" 1.4637 +#if defined(__SSE2__) 1.4638 + , "xmm0", "xmm1", "xmm2" 1.4639 +#endif 1.4640 + ); 1.4641 +} 1.4642 +#endif // HAS_ARGBSHADEROW_SSE2 1.4643 + 1.4644 +#ifdef HAS_ARGBMULTIPLYROW_SSE2 1.4645 +// Multiply 2 rows of ARGB pixels together, 4 pixels at a time. 1.4646 +void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 1.4647 + uint8* dst_argb, int width) { 1.4648 + asm volatile ( 1.4649 + "pxor %%xmm5,%%xmm5 \n" 1.4650 + 1.4651 + // 4 pixel loop. 
1.4652 + LABELALIGN 1.4653 + "1: \n" 1.4654 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.4655 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4656 + "movdqu " MEMACCESS(1) ",%%xmm2 \n" 1.4657 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4658 + "movdqu %%xmm0,%%xmm1 \n" 1.4659 + "movdqu %%xmm2,%%xmm3 \n" 1.4660 + "punpcklbw %%xmm0,%%xmm0 \n" 1.4661 + "punpckhbw %%xmm1,%%xmm1 \n" 1.4662 + "punpcklbw %%xmm5,%%xmm2 \n" 1.4663 + "punpckhbw %%xmm5,%%xmm3 \n" 1.4664 + "pmulhuw %%xmm2,%%xmm0 \n" 1.4665 + "pmulhuw %%xmm3,%%xmm1 \n" 1.4666 + "packuswb %%xmm1,%%xmm0 \n" 1.4667 + "sub $0x4,%3 \n" 1.4668 + "movdqu %%xmm0," MEMACCESS(2) " \n" 1.4669 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.4670 + "jg 1b \n" 1.4671 + : "+r"(src_argb0), // %0 1.4672 + "+r"(src_argb1), // %1 1.4673 + "+r"(dst_argb), // %2 1.4674 + "+r"(width) // %3 1.4675 + : 1.4676 + : "memory", "cc" 1.4677 +#if defined(__SSE2__) 1.4678 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.4679 +#endif 1.4680 + ); 1.4681 +} 1.4682 +#endif // HAS_ARGBMULTIPLYROW_SSE2 1.4683 + 1.4684 +#ifdef HAS_ARGBADDROW_SSE2 1.4685 +// Add 2 rows of ARGB pixels together, 4 pixels at a time. 1.4686 +void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 1.4687 + uint8* dst_argb, int width) { 1.4688 + asm volatile ( 1.4689 + // 4 pixel loop. 
1.4690 + LABELALIGN 1.4691 + "1: \n" 1.4692 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.4693 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4694 + "movdqu " MEMACCESS(1) ",%%xmm1 \n" 1.4695 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4696 + "paddusb %%xmm1,%%xmm0 \n" 1.4697 + "sub $0x4,%3 \n" 1.4698 + "movdqu %%xmm0," MEMACCESS(2) " \n" 1.4699 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.4700 + "jg 1b \n" 1.4701 + : "+r"(src_argb0), // %0 1.4702 + "+r"(src_argb1), // %1 1.4703 + "+r"(dst_argb), // %2 1.4704 + "+r"(width) // %3 1.4705 + : 1.4706 + : "memory", "cc" 1.4707 +#if defined(__SSE2__) 1.4708 + , "xmm0", "xmm1" 1.4709 +#endif 1.4710 + ); 1.4711 +} 1.4712 +#endif // HAS_ARGBADDROW_SSE2 1.4713 + 1.4714 +#ifdef HAS_ARGBSUBTRACTROW_SSE2 1.4715 +// Subtract 2 rows of ARGB pixels, 4 pixels at a time. 1.4716 +void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 1.4717 + uint8* dst_argb, int width) { 1.4718 + asm volatile ( 1.4719 + // 4 pixel loop. 1.4720 + LABELALIGN 1.4721 + "1: \n" 1.4722 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.4723 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4724 + "movdqu " MEMACCESS(1) ",%%xmm1 \n" 1.4725 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.4726 + "psubusb %%xmm1,%%xmm0 \n" 1.4727 + "sub $0x4,%3 \n" 1.4728 + "movdqu %%xmm0," MEMACCESS(2) " \n" 1.4729 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.4730 + "jg 1b \n" 1.4731 + : "+r"(src_argb0), // %0 1.4732 + "+r"(src_argb1), // %1 1.4733 + "+r"(dst_argb), // %2 1.4734 + "+r"(width) // %3 1.4735 + : 1.4736 + : "memory", "cc" 1.4737 +#if defined(__SSE2__) 1.4738 + , "xmm0", "xmm1" 1.4739 +#endif 1.4740 + ); 1.4741 +} 1.4742 +#endif // HAS_ARGBSUBTRACTROW_SSE2 1.4743 + 1.4744 +#ifdef HAS_SOBELXROW_SSE2 1.4745 +// SobelX as a matrix is 1.4746 +// -1 0 1 1.4747 +// -2 0 2 1.4748 +// -1 0 1 1.4749 +void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, 1.4750 + const uint8* src_y2, uint8* dst_sobelx, int width) { 1.4751 + asm volatile ( 1.4752 + "sub %0,%1 \n" 1.4753 + "sub %0,%2 \n" 1.4754 + "sub %0,%3 \n" 1.4755 + "pxor 
%%xmm5,%%xmm5 \n" 1.4756 + 1.4757 + // 8 pixel loop. 1.4758 + LABELALIGN 1.4759 + "1: \n" 1.4760 + "movq " MEMACCESS(0) ",%%xmm0 \n" 1.4761 + "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" 1.4762 + "punpcklbw %%xmm5,%%xmm0 \n" 1.4763 + "punpcklbw %%xmm5,%%xmm1 \n" 1.4764 + "psubw %%xmm1,%%xmm0 \n" 1.4765 + BUNDLEALIGN 1.4766 + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 1.4767 + MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 1.4768 + "punpcklbw %%xmm5,%%xmm1 \n" 1.4769 + "punpcklbw %%xmm5,%%xmm2 \n" 1.4770 + "psubw %%xmm2,%%xmm1 \n" 1.4771 + BUNDLEALIGN 1.4772 + MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 1.4773 + MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 1.4774 + "punpcklbw %%xmm5,%%xmm2 \n" 1.4775 + "punpcklbw %%xmm5,%%xmm3 \n" 1.4776 + "psubw %%xmm3,%%xmm2 \n" 1.4777 + "paddw %%xmm2,%%xmm0 \n" 1.4778 + "paddw %%xmm1,%%xmm0 \n" 1.4779 + "paddw %%xmm1,%%xmm0 \n" 1.4780 + "pxor %%xmm1,%%xmm1 \n" 1.4781 + "psubw %%xmm0,%%xmm1 \n" 1.4782 + "pmaxsw %%xmm1,%%xmm0 \n" 1.4783 + "packuswb %%xmm0,%%xmm0 \n" 1.4784 + "sub $0x8,%4 \n" 1.4785 + BUNDLEALIGN 1.4786 + MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) 1.4787 + "lea " MEMLEA(0x8,0) ",%0 \n" 1.4788 + "jg 1b \n" 1.4789 + : "+r"(src_y0), // %0 1.4790 + "+r"(src_y1), // %1 1.4791 + "+r"(src_y2), // %2 1.4792 + "+r"(dst_sobelx), // %3 1.4793 + "+r"(width) // %4 1.4794 + : 1.4795 + : "memory", "cc" 1.4796 +#if defined(__native_client__) && defined(__x86_64__) 1.4797 + , "r14" 1.4798 +#endif 1.4799 +#if defined(__SSE2__) 1.4800 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.4801 +#endif 1.4802 + ); 1.4803 +} 1.4804 +#endif // HAS_SOBELXROW_SSE2 1.4805 + 1.4806 +#ifdef HAS_SOBELYROW_SSE2 1.4807 +// SobelY as a matrix is 1.4808 +// -1 -2 -1 1.4809 +// 0 0 0 1.4810 +// 1 2 1 1.4811 +void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, 1.4812 + uint8* dst_sobely, int width) { 1.4813 + asm volatile ( 1.4814 + "sub %0,%1 \n" 1.4815 + "sub %0,%2 \n" 1.4816 + "pxor 
%%xmm5,%%xmm5 \n" 1.4817 + 1.4818 + // 8 pixel loop. 1.4819 + LABELALIGN 1.4820 + "1: \n" 1.4821 + "movq " MEMACCESS(0) ",%%xmm0 \n" 1.4822 + MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 1.4823 + "punpcklbw %%xmm5,%%xmm0 \n" 1.4824 + "punpcklbw %%xmm5,%%xmm1 \n" 1.4825 + "psubw %%xmm1,%%xmm0 \n" 1.4826 + BUNDLEALIGN 1.4827 + "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" 1.4828 + MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 1.4829 + "punpcklbw %%xmm5,%%xmm1 \n" 1.4830 + "punpcklbw %%xmm5,%%xmm2 \n" 1.4831 + "psubw %%xmm2,%%xmm1 \n" 1.4832 + BUNDLEALIGN 1.4833 + "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" 1.4834 + MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 1.4835 + "punpcklbw %%xmm5,%%xmm2 \n" 1.4836 + "punpcklbw %%xmm5,%%xmm3 \n" 1.4837 + "psubw %%xmm3,%%xmm2 \n" 1.4838 + "paddw %%xmm2,%%xmm0 \n" 1.4839 + "paddw %%xmm1,%%xmm0 \n" 1.4840 + "paddw %%xmm1,%%xmm0 \n" 1.4841 + "pxor %%xmm1,%%xmm1 \n" 1.4842 + "psubw %%xmm0,%%xmm1 \n" 1.4843 + "pmaxsw %%xmm1,%%xmm0 \n" 1.4844 + "packuswb %%xmm0,%%xmm0 \n" 1.4845 + "sub $0x8,%3 \n" 1.4846 + BUNDLEALIGN 1.4847 + MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) 1.4848 + "lea " MEMLEA(0x8,0) ",%0 \n" 1.4849 + "jg 1b \n" 1.4850 + : "+r"(src_y0), // %0 1.4851 + "+r"(src_y1), // %1 1.4852 + "+r"(dst_sobely), // %2 1.4853 + "+r"(width) // %3 1.4854 + : 1.4855 + : "memory", "cc" 1.4856 +#if defined(__native_client__) && defined(__x86_64__) 1.4857 + , "r14" 1.4858 +#endif 1.4859 +#if defined(__SSE2__) 1.4860 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.4861 +#endif 1.4862 + ); 1.4863 +} 1.4864 +#endif // HAS_SOBELYROW_SSE2 1.4865 + 1.4866 +#ifdef HAS_SOBELROW_SSE2 1.4867 +// Adds Sobel X and Sobel Y and stores Sobel into ARGB. 
1.4868 +// A = 255 1.4869 +// R = Sobel 1.4870 +// G = Sobel 1.4871 +// B = Sobel 1.4872 +void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 1.4873 + uint8* dst_argb, int width) { 1.4874 + asm volatile ( 1.4875 + "sub %0,%1 \n" 1.4876 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.4877 + "pslld $0x18,%%xmm5 \n" 1.4878 + 1.4879 + // 8 pixel loop. 1.4880 + LABELALIGN 1.4881 + "1: \n" 1.4882 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4883 + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 1.4884 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4885 + "paddusb %%xmm1,%%xmm0 \n" 1.4886 + "movdqa %%xmm0,%%xmm2 \n" 1.4887 + "punpcklbw %%xmm0,%%xmm2 \n" 1.4888 + "punpckhbw %%xmm0,%%xmm0 \n" 1.4889 + "movdqa %%xmm2,%%xmm1 \n" 1.4890 + "punpcklwd %%xmm2,%%xmm1 \n" 1.4891 + "punpckhwd %%xmm2,%%xmm2 \n" 1.4892 + "por %%xmm5,%%xmm1 \n" 1.4893 + "por %%xmm5,%%xmm2 \n" 1.4894 + "movdqa %%xmm0,%%xmm3 \n" 1.4895 + "punpcklwd %%xmm0,%%xmm3 \n" 1.4896 + "punpckhwd %%xmm0,%%xmm0 \n" 1.4897 + "por %%xmm5,%%xmm3 \n" 1.4898 + "por %%xmm5,%%xmm0 \n" 1.4899 + "sub $0x10,%3 \n" 1.4900 + "movdqa %%xmm1," MEMACCESS(2) " \n" 1.4901 + "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" 1.4902 + "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" 1.4903 + "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" 1.4904 + "lea " MEMLEA(0x40,2) ",%2 \n" 1.4905 + "jg 1b \n" 1.4906 + : "+r"(src_sobelx), // %0 1.4907 + "+r"(src_sobely), // %1 1.4908 + "+r"(dst_argb), // %2 1.4909 + "+r"(width) // %3 1.4910 + : 1.4911 + : "memory", "cc" 1.4912 +#if defined(__native_client__) && defined(__x86_64__) 1.4913 + , "r14" 1.4914 +#endif 1.4915 +#if defined(__SSE2__) 1.4916 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.4917 +#endif 1.4918 + ); 1.4919 +} 1.4920 +#endif // HAS_SOBELROW_SSE2 1.4921 + 1.4922 +#ifdef HAS_SOBELTOPLANEROW_SSE2 1.4923 +// Adds Sobel X and Sobel Y and stores Sobel into a plane. 
1.4924 +void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 1.4925 + uint8* dst_y, int width) { 1.4926 + asm volatile ( 1.4927 + "sub %0,%1 \n" 1.4928 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.4929 + "pslld $0x18,%%xmm5 \n" 1.4930 + 1.4931 + // 8 pixel loop. 1.4932 + LABELALIGN 1.4933 + "1: \n" 1.4934 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4935 + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 1.4936 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4937 + "paddusb %%xmm1,%%xmm0 \n" 1.4938 + "sub $0x10,%3 \n" 1.4939 + "movdqa %%xmm0," MEMACCESS(2) " \n" 1.4940 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.4941 + "jg 1b \n" 1.4942 + : "+r"(src_sobelx), // %0 1.4943 + "+r"(src_sobely), // %1 1.4944 + "+r"(dst_y), // %2 1.4945 + "+r"(width) // %3 1.4946 + : 1.4947 + : "memory", "cc" 1.4948 +#if defined(__native_client__) && defined(__x86_64__) 1.4949 + , "r14" 1.4950 +#endif 1.4951 +#if defined(__SSE2__) 1.4952 + , "xmm0", "xmm1" 1.4953 +#endif 1.4954 + ); 1.4955 +} 1.4956 +#endif // HAS_SOBELTOPLANEROW_SSE2 1.4957 + 1.4958 +#ifdef HAS_SOBELXYROW_SSE2 1.4959 +// Mixes Sobel X, Sobel Y and Sobel into ARGB. 1.4960 +// A = 255 1.4961 +// R = Sobel X 1.4962 +// G = Sobel 1.4963 +// B = Sobel Y 1.4964 +void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, 1.4965 + uint8* dst_argb, int width) { 1.4966 + asm volatile ( 1.4967 + "sub %0,%1 \n" 1.4968 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.4969 + 1.4970 + // 8 pixel loop. 
1.4971 + LABELALIGN 1.4972 + "1: \n" 1.4973 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.4974 + MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 1.4975 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.4976 + "movdqa %%xmm0,%%xmm2 \n" 1.4977 + "paddusb %%xmm1,%%xmm2 \n" 1.4978 + "movdqa %%xmm0,%%xmm3 \n" 1.4979 + "punpcklbw %%xmm5,%%xmm3 \n" 1.4980 + "punpckhbw %%xmm5,%%xmm0 \n" 1.4981 + "movdqa %%xmm1,%%xmm4 \n" 1.4982 + "punpcklbw %%xmm2,%%xmm4 \n" 1.4983 + "punpckhbw %%xmm2,%%xmm1 \n" 1.4984 + "movdqa %%xmm4,%%xmm6 \n" 1.4985 + "punpcklwd %%xmm3,%%xmm6 \n" 1.4986 + "punpckhwd %%xmm3,%%xmm4 \n" 1.4987 + "movdqa %%xmm1,%%xmm7 \n" 1.4988 + "punpcklwd %%xmm0,%%xmm7 \n" 1.4989 + "punpckhwd %%xmm0,%%xmm1 \n" 1.4990 + "sub $0x10,%3 \n" 1.4991 + "movdqa %%xmm6," MEMACCESS(2) " \n" 1.4992 + "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" 1.4993 + "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" 1.4994 + "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" 1.4995 + "lea " MEMLEA(0x40,2) ",%2 \n" 1.4996 + "jg 1b \n" 1.4997 + : "+r"(src_sobelx), // %0 1.4998 + "+r"(src_sobely), // %1 1.4999 + "+r"(dst_argb), // %2 1.5000 + "+r"(width) // %3 1.5001 + : 1.5002 + : "memory", "cc" 1.5003 +#if defined(__native_client__) && defined(__x86_64__) 1.5004 + , "r14" 1.5005 +#endif 1.5006 +#if defined(__SSE2__) 1.5007 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.5008 +#endif 1.5009 + ); 1.5010 +} 1.5011 +#endif // HAS_SOBELXYROW_SSE2 1.5012 + 1.5013 +#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 1.5014 +// Creates a table of cumulative sums where each value is a sum of all values 1.5015 +// above and to the left of the value, inclusive of the value. 
1.5016 +void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 1.5017 + const int32* previous_cumsum, int width) { 1.5018 + asm volatile ( 1.5019 + "pxor %%xmm0,%%xmm0 \n" 1.5020 + "pxor %%xmm1,%%xmm1 \n" 1.5021 + "sub $0x4,%3 \n" 1.5022 + "jl 49f \n" 1.5023 + "test $0xf,%1 \n" 1.5024 + "jne 49f \n" 1.5025 + 1.5026 + // 4 pixel loop \n" 1.5027 + LABELALIGN 1.5028 + "40: \n" 1.5029 + "movdqu " MEMACCESS(0) ",%%xmm2 \n" 1.5030 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.5031 + "movdqa %%xmm2,%%xmm4 \n" 1.5032 + "punpcklbw %%xmm1,%%xmm2 \n" 1.5033 + "movdqa %%xmm2,%%xmm3 \n" 1.5034 + "punpcklwd %%xmm1,%%xmm2 \n" 1.5035 + "punpckhwd %%xmm1,%%xmm3 \n" 1.5036 + "punpckhbw %%xmm1,%%xmm4 \n" 1.5037 + "movdqa %%xmm4,%%xmm5 \n" 1.5038 + "punpcklwd %%xmm1,%%xmm4 \n" 1.5039 + "punpckhwd %%xmm1,%%xmm5 \n" 1.5040 + "paddd %%xmm2,%%xmm0 \n" 1.5041 + "movdqa " MEMACCESS(2) ",%%xmm2 \n" 1.5042 + "paddd %%xmm0,%%xmm2 \n" 1.5043 + "paddd %%xmm3,%%xmm0 \n" 1.5044 + "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" 1.5045 + "paddd %%xmm0,%%xmm3 \n" 1.5046 + "paddd %%xmm4,%%xmm0 \n" 1.5047 + "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" 1.5048 + "paddd %%xmm0,%%xmm4 \n" 1.5049 + "paddd %%xmm5,%%xmm0 \n" 1.5050 + "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" 1.5051 + "lea " MEMLEA(0x40,2) ",%2 \n" 1.5052 + "paddd %%xmm0,%%xmm5 \n" 1.5053 + "movdqa %%xmm2," MEMACCESS(1) " \n" 1.5054 + "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" 1.5055 + "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" 1.5056 + "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" 1.5057 + "lea " MEMLEA(0x40,1) ",%1 \n" 1.5058 + "sub $0x4,%3 \n" 1.5059 + "jge 40b \n" 1.5060 + 1.5061 + "49: \n" 1.5062 + "add $0x3,%3 \n" 1.5063 + "jl 19f \n" 1.5064 + 1.5065 + // 1 pixel loop \n" 1.5066 + LABELALIGN 1.5067 + "10: \n" 1.5068 + "movd " MEMACCESS(0) ",%%xmm2 \n" 1.5069 + "lea " MEMLEA(0x4,0) ",%0 \n" 1.5070 + "punpcklbw %%xmm1,%%xmm2 \n" 1.5071 + "punpcklwd %%xmm1,%%xmm2 \n" 1.5072 + "paddd %%xmm2,%%xmm0 \n" 1.5073 + "movdqu " MEMACCESS(2) ",%%xmm2 \n" 1.5074 + 
"lea " MEMLEA(0x10,2) ",%2 \n" 1.5075 + "paddd %%xmm0,%%xmm2 \n" 1.5076 + "movdqu %%xmm2," MEMACCESS(1) " \n" 1.5077 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5078 + "sub $0x1,%3 \n" 1.5079 + "jge 10b \n" 1.5080 + 1.5081 + "19: \n" 1.5082 + : "+r"(row), // %0 1.5083 + "+r"(cumsum), // %1 1.5084 + "+r"(previous_cumsum), // %2 1.5085 + "+r"(width) // %3 1.5086 + : 1.5087 + : "memory", "cc" 1.5088 +#if defined(__SSE2__) 1.5089 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.5090 +#endif 1.5091 + ); 1.5092 +} 1.5093 +#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 1.5094 + 1.5095 +#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 1.5096 +void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, 1.5097 + int width, int area, uint8* dst, 1.5098 + int count) { 1.5099 + asm volatile ( 1.5100 + "movd %5,%%xmm5 \n" 1.5101 + "cvtdq2ps %%xmm5,%%xmm5 \n" 1.5102 + "rcpss %%xmm5,%%xmm4 \n" 1.5103 + "pshufd $0x0,%%xmm4,%%xmm4 \n" 1.5104 + "sub $0x4,%3 \n" 1.5105 + "jl 49f \n" 1.5106 + "cmpl $0x80,%5 \n" 1.5107 + "ja 40f \n" 1.5108 + 1.5109 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5110 + "pcmpeqb %%xmm6,%%xmm6 \n" 1.5111 + "psrld $0x10,%%xmm6 \n" 1.5112 + "cvtdq2ps %%xmm6,%%xmm6 \n" 1.5113 + "addps %%xmm6,%%xmm5 \n" 1.5114 + "mulps %%xmm4,%%xmm5 \n" 1.5115 + "cvtps2dq %%xmm5,%%xmm5 \n" 1.5116 + "packssdw %%xmm5,%%xmm5 \n" 1.5117 + 1.5118 + // 4 pixel small loop \n" 1.5119 + LABELALIGN 1.5120 + "4: \n" 1.5121 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5122 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.5123 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.5124 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.5125 + BUNDLEALIGN 1.5126 + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 1.5127 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 1.5128 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 1.5129 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 1.5130 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.5131 + "psubd " MEMACCESS(1) ",%%xmm0 \n" 
1.5132 + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 1.5133 + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 1.5134 + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 1.5135 + BUNDLEALIGN 1.5136 + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 1.5137 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 1.5138 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 1.5139 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 1.5140 + "lea " MEMLEA(0x40,1) ",%1 \n" 1.5141 + "packssdw %%xmm1,%%xmm0 \n" 1.5142 + "packssdw %%xmm3,%%xmm2 \n" 1.5143 + "pmulhuw %%xmm5,%%xmm0 \n" 1.5144 + "pmulhuw %%xmm5,%%xmm2 \n" 1.5145 + "packuswb %%xmm2,%%xmm0 \n" 1.5146 + "movdqu %%xmm0," MEMACCESS(2) " \n" 1.5147 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.5148 + "sub $0x4,%3 \n" 1.5149 + "jge 4b \n" 1.5150 + "jmp 49f \n" 1.5151 + 1.5152 + // 4 pixel loop \n" 1.5153 + LABELALIGN 1.5154 + "40: \n" 1.5155 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5156 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.5157 + "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" 1.5158 + "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" 1.5159 + BUNDLEALIGN 1.5160 + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 1.5161 + MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 1.5162 + MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 1.5163 + MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 1.5164 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.5165 + "psubd " MEMACCESS(1) ",%%xmm0 \n" 1.5166 + "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" 1.5167 + "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" 1.5168 + "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" 1.5169 + BUNDLEALIGN 1.5170 + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 1.5171 + MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 1.5172 + MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 1.5173 + MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 1.5174 + "lea " MEMLEA(0x40,1) ",%1 \n" 1.5175 + 
"cvtdq2ps %%xmm0,%%xmm0 \n" 1.5176 + "cvtdq2ps %%xmm1,%%xmm1 \n" 1.5177 + "mulps %%xmm4,%%xmm0 \n" 1.5178 + "mulps %%xmm4,%%xmm1 \n" 1.5179 + "cvtdq2ps %%xmm2,%%xmm2 \n" 1.5180 + "cvtdq2ps %%xmm3,%%xmm3 \n" 1.5181 + "mulps %%xmm4,%%xmm2 \n" 1.5182 + "mulps %%xmm4,%%xmm3 \n" 1.5183 + "cvtps2dq %%xmm0,%%xmm0 \n" 1.5184 + "cvtps2dq %%xmm1,%%xmm1 \n" 1.5185 + "cvtps2dq %%xmm2,%%xmm2 \n" 1.5186 + "cvtps2dq %%xmm3,%%xmm3 \n" 1.5187 + "packssdw %%xmm1,%%xmm0 \n" 1.5188 + "packssdw %%xmm3,%%xmm2 \n" 1.5189 + "packuswb %%xmm2,%%xmm0 \n" 1.5190 + "movdqu %%xmm0," MEMACCESS(2) " \n" 1.5191 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.5192 + "sub $0x4,%3 \n" 1.5193 + "jge 40b \n" 1.5194 + 1.5195 + "49: \n" 1.5196 + "add $0x3,%3 \n" 1.5197 + "jl 19f \n" 1.5198 + 1.5199 + // 1 pixel loop \n" 1.5200 + LABELALIGN 1.5201 + "10: \n" 1.5202 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5203 + MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 1.5204 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.5205 + "psubd " MEMACCESS(1) ",%%xmm0 \n" 1.5206 + BUNDLEALIGN 1.5207 + MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 1.5208 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5209 + "cvtdq2ps %%xmm0,%%xmm0 \n" 1.5210 + "mulps %%xmm4,%%xmm0 \n" 1.5211 + "cvtps2dq %%xmm0,%%xmm0 \n" 1.5212 + "packssdw %%xmm0,%%xmm0 \n" 1.5213 + "packuswb %%xmm0,%%xmm0 \n" 1.5214 + "movd %%xmm0," MEMACCESS(2) " \n" 1.5215 + "lea " MEMLEA(0x4,2) ",%2 \n" 1.5216 + "sub $0x1,%3 \n" 1.5217 + "jge 10b \n" 1.5218 + "19: \n" 1.5219 + : "+r"(topleft), // %0 1.5220 + "+r"(botleft), // %1 1.5221 + "+r"(dst), // %2 1.5222 + "+rm"(count) // %3 1.5223 + : "r"((intptr_t)(width)), // %4 1.5224 + "rm"(area) // %5 1.5225 + : "memory", "cc" 1.5226 +#if defined(__native_client__) && defined(__x86_64__) 1.5227 + , "r14" 1.5228 +#endif 1.5229 +#if defined(__SSE2__) 1.5230 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.5231 +#endif 1.5232 + ); 1.5233 +} 1.5234 +#endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 1.5235 + 1.5236 +#ifdef 
HAS_ARGBAFFINEROW_SSE2 1.5237 +// Copy ARGB pixels from source image with slope to a row of destination. 1.5238 +LIBYUV_API 1.5239 +void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 1.5240 + uint8* dst_argb, const float* src_dudv, int width) { 1.5241 + intptr_t src_argb_stride_temp = src_argb_stride; 1.5242 + intptr_t temp = 0; 1.5243 + asm volatile ( 1.5244 + "movq " MEMACCESS(3) ",%%xmm2 \n" 1.5245 + "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" 1.5246 + "shl $0x10,%1 \n" 1.5247 + "add $0x4,%1 \n" 1.5248 + "movd %1,%%xmm5 \n" 1.5249 + "sub $0x4,%4 \n" 1.5250 + "jl 49f \n" 1.5251 + 1.5252 + "pshufd $0x44,%%xmm7,%%xmm7 \n" 1.5253 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5254 + "movdqa %%xmm2,%%xmm0 \n" 1.5255 + "addps %%xmm7,%%xmm0 \n" 1.5256 + "movlhps %%xmm0,%%xmm2 \n" 1.5257 + "movdqa %%xmm7,%%xmm4 \n" 1.5258 + "addps %%xmm4,%%xmm4 \n" 1.5259 + "movdqa %%xmm2,%%xmm3 \n" 1.5260 + "addps %%xmm4,%%xmm3 \n" 1.5261 + "addps %%xmm4,%%xmm4 \n" 1.5262 + 1.5263 + // 4 pixel loop \n" 1.5264 + LABELALIGN 1.5265 + "40: \n" 1.5266 + "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 1.5267 + "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 1.5268 + "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts 1.5269 + "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride 1.5270 + "movd %%xmm0,%k1 \n" 1.5271 + "pshufd $0x39,%%xmm0,%%xmm0 \n" 1.5272 + "movd %%xmm0,%k5 \n" 1.5273 + "pshufd $0x39,%%xmm0,%%xmm0 \n" 1.5274 + BUNDLEALIGN 1.5275 + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 1.5276 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 1.5277 + "punpckldq %%xmm6,%%xmm1 \n" 1.5278 + "addps %%xmm4,%%xmm2 \n" 1.5279 + "movq %%xmm1," MEMACCESS(2) " \n" 1.5280 + "movd %%xmm0,%k1 \n" 1.5281 + "pshufd $0x39,%%xmm0,%%xmm0 \n" 1.5282 + "movd %%xmm0,%k5 \n" 1.5283 + BUNDLEALIGN 1.5284 + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 1.5285 + MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 1.5286 + "punpckldq %%xmm6,%%xmm0 \n" 1.5287 + 
"addps %%xmm4,%%xmm3 \n" 1.5288 + "sub $0x4,%4 \n" 1.5289 + "movq %%xmm0," MEMACCESS2(0x08,2) " \n" 1.5290 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.5291 + "jge 40b \n" 1.5292 + 1.5293 + "49: \n" 1.5294 + "add $0x3,%4 \n" 1.5295 + "jl 19f \n" 1.5296 + 1.5297 + // 1 pixel loop \n" 1.5298 + LABELALIGN 1.5299 + "10: \n" 1.5300 + "cvttps2dq %%xmm2,%%xmm0 \n" 1.5301 + "packssdw %%xmm0,%%xmm0 \n" 1.5302 + "pmaddwd %%xmm5,%%xmm0 \n" 1.5303 + "addps %%xmm7,%%xmm2 \n" 1.5304 + "movd %%xmm0,%k1 \n" 1.5305 + BUNDLEALIGN 1.5306 + MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 1.5307 + "sub $0x1,%4 \n" 1.5308 + "movd %%xmm0," MEMACCESS(2) " \n" 1.5309 + "lea " MEMLEA(0x04,2) ",%2 \n" 1.5310 + "jge 10b \n" 1.5311 + "19: \n" 1.5312 + : "+r"(src_argb), // %0 1.5313 + "+r"(src_argb_stride_temp), // %1 1.5314 + "+r"(dst_argb), // %2 1.5315 + "+r"(src_dudv), // %3 1.5316 + "+rm"(width), // %4 1.5317 + "+r"(temp) // %5 1.5318 + : 1.5319 + : "memory", "cc" 1.5320 +#if defined(__native_client__) && defined(__x86_64__) 1.5321 + , "r14" 1.5322 +#endif 1.5323 +#if defined(__SSE2__) 1.5324 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.5325 +#endif 1.5326 + ); 1.5327 +} 1.5328 +#endif // HAS_ARGBAFFINEROW_SSE2 1.5329 + 1.5330 +#ifdef HAS_INTERPOLATEROW_SSSE3 1.5331 +// Bilinear filter 16x2 -> 16x1 1.5332 +void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1.5333 + ptrdiff_t src_stride, int dst_width, 1.5334 + int source_y_fraction) { 1.5335 + asm volatile ( 1.5336 + "sub %1,%0 \n" 1.5337 + "shr %3 \n" 1.5338 + "cmp $0x0,%3 \n" 1.5339 + "je 100f \n" 1.5340 + "cmp $0x20,%3 \n" 1.5341 + "je 75f \n" 1.5342 + "cmp $0x40,%3 \n" 1.5343 + "je 50f \n" 1.5344 + "cmp $0x60,%3 \n" 1.5345 + "je 25f \n" 1.5346 + 1.5347 + "movd %3,%%xmm0 \n" 1.5348 + "neg %3 \n" 1.5349 + "add $0x80,%3 \n" 1.5350 + "movd %3,%%xmm5 \n" 1.5351 + "punpcklbw %%xmm0,%%xmm5 \n" 1.5352 + "punpcklwd %%xmm5,%%xmm5 \n" 1.5353 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5354 + 1.5355 + // General purpose 
row blend. 1.5356 + LABELALIGN 1.5357 + "1: \n" 1.5358 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5359 + MEMOPREG(movdqa,0x00,1,4,1,xmm2) 1.5360 + "movdqa %%xmm0,%%xmm1 \n" 1.5361 + "punpcklbw %%xmm2,%%xmm0 \n" 1.5362 + "punpckhbw %%xmm2,%%xmm1 \n" 1.5363 + "pmaddubsw %%xmm5,%%xmm0 \n" 1.5364 + "pmaddubsw %%xmm5,%%xmm1 \n" 1.5365 + "psrlw $0x7,%%xmm0 \n" 1.5366 + "psrlw $0x7,%%xmm1 \n" 1.5367 + "packuswb %%xmm1,%%xmm0 \n" 1.5368 + "sub $0x10,%2 \n" 1.5369 + BUNDLEALIGN 1.5370 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 1.5371 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5372 + "jg 1b \n" 1.5373 + "jmp 99f \n" 1.5374 + 1.5375 + // Blend 25 / 75. 1.5376 + LABELALIGN 1.5377 + "25: \n" 1.5378 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5379 + MEMOPREG(movdqa,0x00,1,4,1,xmm1) 1.5380 + "pavgb %%xmm1,%%xmm0 \n" 1.5381 + "pavgb %%xmm1,%%xmm0 \n" 1.5382 + "sub $0x10,%2 \n" 1.5383 + BUNDLEALIGN 1.5384 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 1.5385 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5386 + "jg 25b \n" 1.5387 + "jmp 99f \n" 1.5388 + 1.5389 + // Blend 50 / 50. 1.5390 + LABELALIGN 1.5391 + "50: \n" 1.5392 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5393 + MEMOPREG(movdqa,0x00,1,4,1,xmm1) 1.5394 + "pavgb %%xmm1,%%xmm0 \n" 1.5395 + "sub $0x10,%2 \n" 1.5396 + BUNDLEALIGN 1.5397 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 1.5398 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5399 + "jg 50b \n" 1.5400 + "jmp 99f \n" 1.5401 + 1.5402 + // Blend 75 / 25. 1.5403 + LABELALIGN 1.5404 + "75: \n" 1.5405 + "movdqa " MEMACCESS(1) ",%%xmm1 \n" 1.5406 + MEMOPREG(movdqa,0x00,1,4,1,xmm0) 1.5407 + "pavgb %%xmm1,%%xmm0 \n" 1.5408 + "pavgb %%xmm1,%%xmm0 \n" 1.5409 + "sub $0x10,%2 \n" 1.5410 + BUNDLEALIGN 1.5411 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 1.5412 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5413 + "jg 75b \n" 1.5414 + "jmp 99f \n" 1.5415 + 1.5416 + // Blend 100 / 0 - Copy row unchanged. 
1.5417 + LABELALIGN 1.5418 + "100: \n" 1.5419 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5420 + "sub $0x10,%2 \n" 1.5421 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) 1.5422 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5423 + "jg 100b \n" 1.5424 + 1.5425 + "99: \n" 1.5426 + : "+r"(dst_ptr), // %0 1.5427 + "+r"(src_ptr), // %1 1.5428 + "+r"(dst_width), // %2 1.5429 + "+r"(source_y_fraction) // %3 1.5430 + : "r"((intptr_t)(src_stride)) // %4 1.5431 + : "memory", "cc" 1.5432 +#if defined(__native_client__) && defined(__x86_64__) 1.5433 + , "r14" 1.5434 +#endif 1.5435 +#if defined(__SSE2__) 1.5436 + , "xmm0", "xmm1", "xmm2", "xmm5" 1.5437 +#endif 1.5438 + ); 1.5439 +} 1.5440 +#endif // HAS_INTERPOLATEROW_SSSE3 1.5441 + 1.5442 +#ifdef HAS_INTERPOLATEROW_SSE2 1.5443 +// Bilinear filter 16x2 -> 16x1 1.5444 +void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, 1.5445 + ptrdiff_t src_stride, int dst_width, 1.5446 + int source_y_fraction) { 1.5447 + asm volatile ( 1.5448 + "sub %1,%0 \n" 1.5449 + "shr %3 \n" 1.5450 + "cmp $0x0,%3 \n" 1.5451 + "je 100f \n" 1.5452 + "cmp $0x20,%3 \n" 1.5453 + "je 75f \n" 1.5454 + "cmp $0x40,%3 \n" 1.5455 + "je 50f \n" 1.5456 + "cmp $0x60,%3 \n" 1.5457 + "je 25f \n" 1.5458 + 1.5459 + "movd %3,%%xmm0 \n" 1.5460 + "neg %3 \n" 1.5461 + "add $0x80,%3 \n" 1.5462 + "movd %3,%%xmm5 \n" 1.5463 + "punpcklbw %%xmm0,%%xmm5 \n" 1.5464 + "punpcklwd %%xmm5,%%xmm5 \n" 1.5465 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5466 + "pxor %%xmm4,%%xmm4 \n" 1.5467 + 1.5468 + // General purpose row blend. 
1.5469 + LABELALIGN 1.5470 + "1: \n" 1.5471 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5472 + MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 1.5473 + "movdqa %%xmm0,%%xmm1 \n" 1.5474 + "movdqa %%xmm2,%%xmm3 \n" 1.5475 + "punpcklbw %%xmm4,%%xmm2 \n" 1.5476 + "punpckhbw %%xmm4,%%xmm3 \n" 1.5477 + "punpcklbw %%xmm4,%%xmm0 \n" 1.5478 + "punpckhbw %%xmm4,%%xmm1 \n" 1.5479 + "psubw %%xmm0,%%xmm2 \n" 1.5480 + "psubw %%xmm1,%%xmm3 \n" 1.5481 + "paddw %%xmm2,%%xmm2 \n" 1.5482 + "paddw %%xmm3,%%xmm3 \n" 1.5483 + "pmulhw %%xmm5,%%xmm2 \n" 1.5484 + "pmulhw %%xmm5,%%xmm3 \n" 1.5485 + "paddw %%xmm2,%%xmm0 \n" 1.5486 + "paddw %%xmm3,%%xmm1 \n" 1.5487 + "packuswb %%xmm1,%%xmm0 \n" 1.5488 + "sub $0x10,%2 \n" 1.5489 + BUNDLEALIGN 1.5490 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 1.5491 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5492 + "jg 1b \n" 1.5493 + "jmp 99f \n" 1.5494 + 1.5495 + // Blend 25 / 75. 1.5496 + LABELALIGN 1.5497 + "25: \n" 1.5498 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5499 + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 1.5500 + "pavgb %%xmm1,%%xmm0 \n" 1.5501 + "pavgb %%xmm1,%%xmm0 \n" 1.5502 + "sub $0x10,%2 \n" 1.5503 + BUNDLEALIGN 1.5504 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 1.5505 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5506 + "jg 25b \n" 1.5507 + "jmp 99f \n" 1.5508 + 1.5509 + // Blend 50 / 50. 1.5510 + LABELALIGN 1.5511 + "50: \n" 1.5512 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5513 + MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 1.5514 + "pavgb %%xmm1,%%xmm0 \n" 1.5515 + "sub $0x10,%2 \n" 1.5516 + BUNDLEALIGN 1.5517 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 1.5518 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5519 + "jg 50b \n" 1.5520 + "jmp 99f \n" 1.5521 + 1.5522 + // Blend 75 / 25. 
1.5523 + LABELALIGN 1.5524 + "75: \n" 1.5525 + "movdqa " MEMACCESS(1) ",%%xmm1 \n" 1.5526 + MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 1.5527 + "pavgb %%xmm1,%%xmm0 \n" 1.5528 + "pavgb %%xmm1,%%xmm0 \n" 1.5529 + "sub $0x10,%2 \n" 1.5530 + BUNDLEALIGN 1.5531 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 1.5532 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5533 + "jg 75b \n" 1.5534 + "jmp 99f \n" 1.5535 + 1.5536 + // Blend 100 / 0 - Copy row unchanged. 1.5537 + LABELALIGN 1.5538 + "100: \n" 1.5539 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.5540 + "sub $0x10,%2 \n" 1.5541 + MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) 1.5542 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5543 + "jg 100b \n" 1.5544 + 1.5545 + "99: \n" 1.5546 + : "+r"(dst_ptr), // %0 1.5547 + "+r"(src_ptr), // %1 1.5548 + "+r"(dst_width), // %2 1.5549 + "+r"(source_y_fraction) // %3 1.5550 + : "r"((intptr_t)(src_stride)) // %4 1.5551 + : "memory", "cc" 1.5552 +#if defined(__native_client__) && defined(__x86_64__) 1.5553 + , "r14" 1.5554 +#endif 1.5555 +#if defined(__SSE2__) 1.5556 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.5557 +#endif 1.5558 + ); 1.5559 +} 1.5560 +#endif // HAS_INTERPOLATEROW_SSE2 1.5561 + 1.5562 +#ifdef HAS_INTERPOLATEROW_SSSE3 1.5563 +// Bilinear filter 16x2 -> 16x1 1.5564 +void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1.5565 + ptrdiff_t src_stride, int dst_width, 1.5566 + int source_y_fraction) { 1.5567 + asm volatile ( 1.5568 + "sub %1,%0 \n" 1.5569 + "shr %3 \n" 1.5570 + "cmp $0x0,%3 \n" 1.5571 + "je 100f \n" 1.5572 + "cmp $0x20,%3 \n" 1.5573 + "je 75f \n" 1.5574 + "cmp $0x40,%3 \n" 1.5575 + "je 50f \n" 1.5576 + "cmp $0x60,%3 \n" 1.5577 + "je 25f \n" 1.5578 + 1.5579 + "movd %3,%%xmm0 \n" 1.5580 + "neg %3 \n" 1.5581 + "add $0x80,%3 \n" 1.5582 + "movd %3,%%xmm5 \n" 1.5583 + "punpcklbw %%xmm0,%%xmm5 \n" 1.5584 + "punpcklwd %%xmm5,%%xmm5 \n" 1.5585 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5586 + 1.5587 + // General purpose row 
blend. 1.5588 + LABELALIGN 1.5589 + "1: \n" 1.5590 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5591 + MEMOPREG(movdqu,0x00,1,4,1,xmm2) 1.5592 + "movdqu %%xmm0,%%xmm1 \n" 1.5593 + "punpcklbw %%xmm2,%%xmm0 \n" 1.5594 + "punpckhbw %%xmm2,%%xmm1 \n" 1.5595 + "pmaddubsw %%xmm5,%%xmm0 \n" 1.5596 + "pmaddubsw %%xmm5,%%xmm1 \n" 1.5597 + "psrlw $0x7,%%xmm0 \n" 1.5598 + "psrlw $0x7,%%xmm1 \n" 1.5599 + "packuswb %%xmm1,%%xmm0 \n" 1.5600 + "sub $0x10,%2 \n" 1.5601 + BUNDLEALIGN 1.5602 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 1.5603 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5604 + "jg 1b \n" 1.5605 + "jmp 99f \n" 1.5606 + 1.5607 + // Blend 25 / 75. 1.5608 + LABELALIGN 1.5609 + "25: \n" 1.5610 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5611 + MEMOPREG(movdqu,0x00,1,4,1,xmm1) 1.5612 + "pavgb %%xmm1,%%xmm0 \n" 1.5613 + "pavgb %%xmm1,%%xmm0 \n" 1.5614 + "sub $0x10,%2 \n" 1.5615 + BUNDLEALIGN 1.5616 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 1.5617 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5618 + "jg 25b \n" 1.5619 + "jmp 99f \n" 1.5620 + 1.5621 + // Blend 50 / 50. 1.5622 + LABELALIGN 1.5623 + "50: \n" 1.5624 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5625 + MEMOPREG(movdqu,0x00,1,4,1,xmm1) 1.5626 + "pavgb %%xmm1,%%xmm0 \n" 1.5627 + "sub $0x10,%2 \n" 1.5628 + BUNDLEALIGN 1.5629 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 1.5630 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5631 + "jg 50b \n" 1.5632 + "jmp 99f \n" 1.5633 + 1.5634 + // Blend 75 / 25. 1.5635 + LABELALIGN 1.5636 + "75: \n" 1.5637 + "movdqu " MEMACCESS(1) ",%%xmm1 \n" 1.5638 + MEMOPREG(movdqu,0x00,1,4,1,xmm0) 1.5639 + "pavgb %%xmm1,%%xmm0 \n" 1.5640 + "pavgb %%xmm1,%%xmm0 \n" 1.5641 + "sub $0x10,%2 \n" 1.5642 + BUNDLEALIGN 1.5643 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 1.5644 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5645 + "jg 75b \n" 1.5646 + "jmp 99f \n" 1.5647 + 1.5648 + // Blend 100 / 0 - Copy row unchanged. 
1.5649 + LABELALIGN 1.5650 + "100: \n" 1.5651 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5652 + "sub $0x10,%2 \n" 1.5653 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) 1.5654 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5655 + "jg 100b \n" 1.5656 + 1.5657 + "99: \n" 1.5658 + : "+r"(dst_ptr), // %0 1.5659 + "+r"(src_ptr), // %1 1.5660 + "+r"(dst_width), // %2 1.5661 + "+r"(source_y_fraction) // %3 1.5662 + : "r"((intptr_t)(src_stride)) // %4 1.5663 + : "memory", "cc" 1.5664 +#if defined(__native_client__) && defined(__x86_64__) 1.5665 + , "r14" 1.5666 +#endif 1.5667 +#if defined(__SSE2__) 1.5668 + , "xmm0", "xmm1", "xmm2", "xmm5" 1.5669 +#endif 1.5670 + ); 1.5671 +} 1.5672 +#endif // HAS_INTERPOLATEROW_SSSE3 1.5673 + 1.5674 +#ifdef HAS_INTERPOLATEROW_SSE2 1.5675 +// Bilinear filter 16x2 -> 16x1 1.5676 +void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, 1.5677 + ptrdiff_t src_stride, int dst_width, 1.5678 + int source_y_fraction) { 1.5679 + asm volatile ( 1.5680 + "sub %1,%0 \n" 1.5681 + "shr %3 \n" 1.5682 + "cmp $0x0,%3 \n" 1.5683 + "je 100f \n" 1.5684 + "cmp $0x20,%3 \n" 1.5685 + "je 75f \n" 1.5686 + "cmp $0x40,%3 \n" 1.5687 + "je 50f \n" 1.5688 + "cmp $0x60,%3 \n" 1.5689 + "je 25f \n" 1.5690 + 1.5691 + "movd %3,%%xmm0 \n" 1.5692 + "neg %3 \n" 1.5693 + "add $0x80,%3 \n" 1.5694 + "movd %3,%%xmm5 \n" 1.5695 + "punpcklbw %%xmm0,%%xmm5 \n" 1.5696 + "punpcklwd %%xmm5,%%xmm5 \n" 1.5697 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5698 + "pxor %%xmm4,%%xmm4 \n" 1.5699 + 1.5700 + // General purpose row blend. 
1.5701 + LABELALIGN 1.5702 + "1: \n" 1.5703 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5704 + MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 1.5705 + "movdqu %%xmm0,%%xmm1 \n" 1.5706 + "movdqu %%xmm2,%%xmm3 \n" 1.5707 + "punpcklbw %%xmm4,%%xmm2 \n" 1.5708 + "punpckhbw %%xmm4,%%xmm3 \n" 1.5709 + "punpcklbw %%xmm4,%%xmm0 \n" 1.5710 + "punpckhbw %%xmm4,%%xmm1 \n" 1.5711 + "psubw %%xmm0,%%xmm2 \n" 1.5712 + "psubw %%xmm1,%%xmm3 \n" 1.5713 + "paddw %%xmm2,%%xmm2 \n" 1.5714 + "paddw %%xmm3,%%xmm3 \n" 1.5715 + "pmulhw %%xmm5,%%xmm2 \n" 1.5716 + "pmulhw %%xmm5,%%xmm3 \n" 1.5717 + "paddw %%xmm2,%%xmm0 \n" 1.5718 + "paddw %%xmm3,%%xmm1 \n" 1.5719 + "packuswb %%xmm1,%%xmm0 \n" 1.5720 + "sub $0x10,%2 \n" 1.5721 + BUNDLEALIGN 1.5722 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 1.5723 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5724 + "jg 1b \n" 1.5725 + "jmp 99f \n" 1.5726 + 1.5727 + // Blend 25 / 75. 1.5728 + LABELALIGN 1.5729 + "25: \n" 1.5730 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5731 + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 1.5732 + "pavgb %%xmm1,%%xmm0 \n" 1.5733 + "pavgb %%xmm1,%%xmm0 \n" 1.5734 + "sub $0x10,%2 \n" 1.5735 + BUNDLEALIGN 1.5736 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 1.5737 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5738 + "jg 25b \n" 1.5739 + "jmp 99f \n" 1.5740 + 1.5741 + // Blend 50 / 50. 1.5742 + LABELALIGN 1.5743 + "50: \n" 1.5744 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5745 + MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 1.5746 + "pavgb %%xmm1,%%xmm0 \n" 1.5747 + "sub $0x10,%2 \n" 1.5748 + BUNDLEALIGN 1.5749 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 1.5750 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5751 + "jg 50b \n" 1.5752 + "jmp 99f \n" 1.5753 + 1.5754 + // Blend 75 / 25. 
1.5755 + LABELALIGN 1.5756 + "75: \n" 1.5757 + "movdqu " MEMACCESS(1) ",%%xmm1 \n" 1.5758 + MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 1.5759 + "pavgb %%xmm1,%%xmm0 \n" 1.5760 + "pavgb %%xmm1,%%xmm0 \n" 1.5761 + "sub $0x10,%2 \n" 1.5762 + BUNDLEALIGN 1.5763 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 1.5764 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5765 + "jg 75b \n" 1.5766 + "jmp 99f \n" 1.5767 + 1.5768 + // Blend 100 / 0 - Copy row unchanged. 1.5769 + LABELALIGN 1.5770 + "100: \n" 1.5771 + "movdqu " MEMACCESS(1) ",%%xmm0 \n" 1.5772 + "sub $0x10,%2 \n" 1.5773 + MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) 1.5774 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.5775 + "jg 100b \n" 1.5776 + 1.5777 + "99: \n" 1.5778 + : "+r"(dst_ptr), // %0 1.5779 + "+r"(src_ptr), // %1 1.5780 + "+r"(dst_width), // %2 1.5781 + "+r"(source_y_fraction) // %3 1.5782 + : "r"((intptr_t)(src_stride)) // %4 1.5783 + : "memory", "cc" 1.5784 +#if defined(__native_client__) && defined(__x86_64__) 1.5785 + , "r14" 1.5786 +#endif 1.5787 +#if defined(__SSE2__) 1.5788 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.5789 +#endif 1.5790 + ); 1.5791 +} 1.5792 +#endif // HAS_INTERPOLATEROW_SSE2 1.5793 + 1.5794 +#ifdef HAS_HALFROW_SSE2 1.5795 +void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, 1.5796 + uint8* dst_uv, int pix) { 1.5797 + asm volatile ( 1.5798 + "sub %0,%1 \n" 1.5799 + LABELALIGN 1.5800 + "1: \n" 1.5801 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5802 + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 1.5803 + "sub $0x10,%2 \n" 1.5804 + MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) 1.5805 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.5806 + "jg 1b \n" 1.5807 + : "+r"(src_uv), // %0 1.5808 + "+r"(dst_uv), // %1 1.5809 + "+r"(pix) // %2 1.5810 + : "r"((intptr_t)(src_uv_stride)) // %3 1.5811 + : "memory", "cc" 1.5812 +#if defined(__SSE2__) 1.5813 + , "xmm0" 1.5814 +#endif 1.5815 + ); 1.5816 +} 1.5817 +#endif // HAS_HALFROW_SSE2 1.5818 + 1.5819 
+#ifdef HAS_ARGBTOBAYERROW_SSSE3 1.5820 +void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, 1.5821 + uint32 selector, int pix) { 1.5822 + asm volatile ( 1.5823 + // NaCL caveat - assumes movd is from GPR 1.5824 + "movd %3,%%xmm5 \n" 1.5825 + "pshufd $0x0,%%xmm5,%%xmm5 \n" 1.5826 + LABELALIGN 1.5827 + "1: \n" 1.5828 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5829 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.5830 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.5831 + "pshufb %%xmm5,%%xmm0 \n" 1.5832 + "pshufb %%xmm5,%%xmm1 \n" 1.5833 + "punpckldq %%xmm1,%%xmm0 \n" 1.5834 + "sub $0x8,%2 \n" 1.5835 + "movq %%xmm0," MEMACCESS(1) " \n" 1.5836 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.5837 + "jg 1b \n" 1.5838 + : "+r"(src_argb), // %0 1.5839 + "+r"(dst_bayer), // %1 1.5840 + "+r"(pix) // %2 1.5841 + : "g"(selector) // %3 1.5842 + : "memory", "cc" 1.5843 +#if defined(__SSE2__) 1.5844 + , "xmm0", "xmm1", "xmm5" 1.5845 +#endif 1.5846 + ); 1.5847 +} 1.5848 +#endif // HAS_ARGBTOBAYERROW_SSSE3 1.5849 + 1.5850 +#ifdef HAS_ARGBTOBAYERGGROW_SSE2 1.5851 +void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, 1.5852 + uint32 selector, int pix) { 1.5853 + asm volatile ( 1.5854 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.5855 + "psrld $0x18,%%xmm5 \n" 1.5856 + LABELALIGN 1.5857 + "1: \n" 1.5858 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5859 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.5860 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.5861 + "psrld $0x8,%%xmm0 \n" 1.5862 + "psrld $0x8,%%xmm1 \n" 1.5863 + "pand %%xmm5,%%xmm0 \n" 1.5864 + "pand %%xmm5,%%xmm1 \n" 1.5865 + "packssdw %%xmm1,%%xmm0 \n" 1.5866 + "packuswb %%xmm1,%%xmm0 \n" 1.5867 + "sub $0x8,%2 \n" 1.5868 + "movq %%xmm0," MEMACCESS(1) " \n" 1.5869 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.5870 + "jg 1b \n" 1.5871 + : "+r"(src_argb), // %0 1.5872 + "+r"(dst_bayer), // %1 1.5873 + "+r"(pix) // %2 1.5874 + : 1.5875 + : "memory", "cc" 1.5876 +#if defined(__SSE2__) 1.5877 + , "xmm0", "xmm1", "xmm5" 1.5878 +#endif 1.5879 + ); 1.5880 +} 1.5881 +#endif // 
HAS_ARGBTOBAYERGGROW_SSE2 1.5882 + 1.5883 +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 1.5884 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1.5885 +void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, 1.5886 + const uint8* shuffler, int pix) { 1.5887 + asm volatile ( 1.5888 + "movdqa " MEMACCESS(3) ",%%xmm5 \n" 1.5889 + LABELALIGN 1.5890 + "1: \n" 1.5891 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.5892 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.5893 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.5894 + "pshufb %%xmm5,%%xmm0 \n" 1.5895 + "pshufb %%xmm5,%%xmm1 \n" 1.5896 + "sub $0x8,%2 \n" 1.5897 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.5898 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.5899 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.5900 + "jg 1b \n" 1.5901 + : "+r"(src_argb), // %0 1.5902 + "+r"(dst_argb), // %1 1.5903 + "+r"(pix) // %2 1.5904 + : "r"(shuffler) // %3 1.5905 + : "memory", "cc" 1.5906 +#if defined(__SSE2__) 1.5907 + , "xmm0", "xmm1", "xmm5" 1.5908 +#endif 1.5909 + ); 1.5910 +} 1.5911 + 1.5912 +void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, 1.5913 + const uint8* shuffler, int pix) { 1.5914 + asm volatile ( 1.5915 + "movdqa " MEMACCESS(3) ",%%xmm5 \n" 1.5916 + LABELALIGN 1.5917 + "1: \n" 1.5918 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.5919 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.5920 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.5921 + "pshufb %%xmm5,%%xmm0 \n" 1.5922 + "pshufb %%xmm5,%%xmm1 \n" 1.5923 + "sub $0x8,%2 \n" 1.5924 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.5925 + "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" 1.5926 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.5927 + "jg 1b \n" 1.5928 + : "+r"(src_argb), // %0 1.5929 + "+r"(dst_argb), // %1 1.5930 + "+r"(pix) // %2 1.5931 + : "r"(shuffler) // %3 1.5932 + : "memory", "cc" 1.5933 +#if defined(__SSE2__) 1.5934 + , "xmm0", "xmm1", "xmm5" 1.5935 +#endif 1.5936 + ); 1.5937 +} 1.5938 +#endif // HAS_ARGBSHUFFLEROW_SSSE3 1.5939 + 1.5940 +#ifdef HAS_ARGBSHUFFLEROW_AVX2 1.5941 +// For BGRAToARGB, 
ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 1.5942 +void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, 1.5943 + const uint8* shuffler, int pix) { 1.5944 + asm volatile ( 1.5945 + "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" 1.5946 + LABELALIGN 1.5947 + "1: \n" 1.5948 + "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" 1.5949 + "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" 1.5950 + "lea " MEMLEA(0x40,0) ",%0 \n" 1.5951 + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" 1.5952 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" 1.5953 + "sub $0x10,%2 \n" 1.5954 + "vmovdqu %%ymm0," MEMACCESS(1) " \n" 1.5955 + "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" 1.5956 + "lea " MEMLEA(0x40,1) ",%1 \n" 1.5957 + "jg 1b \n" 1.5958 + : "+r"(src_argb), // %0 1.5959 + "+r"(dst_argb), // %1 1.5960 + "+r"(pix) // %2 1.5961 + : "r"(shuffler) // %3 1.5962 + : "memory", "cc" 1.5963 +#if defined(__SSE2__) 1.5964 + , "xmm0", "xmm1", "xmm5" 1.5965 +#endif 1.5966 + ); 1.5967 +} 1.5968 +#endif // HAS_ARGBSHUFFLEROW_AVX2 1.5969 + 1.5970 +#ifdef HAS_ARGBSHUFFLEROW_SSE2 1.5971 +// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
// Runtime-dispatched byte shuffle for SSE2 (no pshufb available).
// Reads the first 4 bytes of 'shuffler' and jumps to a vectorized path
// for four recognized orders (0x03000102, 0x00010203, 0x00030201,
// 0x02010003, implemented with pshufhw/pshuflw on widened words);
// any other selector falls back to a scalar per-byte table loop.
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;  // scratch byte-index register (%2, dl usable)
  asm volatile (
    "pxor %%xmm5,%%xmm5 \n"
    "mov " MEMACCESS(4) ",%k2 \n"              // first dword of shuffler
    "cmp $0x3000102,%k2 \n"
    "je 3012f \n"
    "cmp $0x10203,%k2 \n"
    "je 123f \n"
    "cmp $0x30201,%k2 \n"
    "je 321f \n"
    "cmp $0x2010003,%k2 \n"
    "je 2103f \n"

    // Generic path: 1 pixel per iteration, byte-by-byte indexed copy.
    LABELALIGN
  "1: \n"
    "movzb " MEMACCESS(4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS(1) " \n"
    "movzb " MEMACCESS2(0x1,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x1,1) " \n"
    BUNDLEALIGN
    "movzb " MEMACCESS2(0x2,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x2,1) " \n"
    "movzb " MEMACCESS2(0x3,4) ",%2 \n"
    MEMOPARG(movzb,0x00,0,2,1,2) " \n"  // movzb (%0,%2,1),%2
    "mov %b2," MEMACCESS2(0x3,1) " \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    "lea " MEMLEA(0x4,1) ",%1 \n"
    "sub $0x1,%3 \n"
    "jg 1b \n"
    "jmp 99f \n"

    // Order 0,1,2,3: full byte reverse within each pixel ($0x1b).
    LABELALIGN
  "123: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"               // widen bytes to words
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
    "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
    "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
    "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"                // narrow back to bytes
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 123b \n"
    "jmp 99f \n"

    // Order 1,2,3,0: rotate bytes down by one ($0x39).
    LABELALIGN
  "321: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x39,%%xmm0,%%xmm0 \n"
    "pshuflw $0x39,%%xmm0,%%xmm0 \n"
    "pshufhw $0x39,%%xmm1,%%xmm1 \n"
    "pshuflw $0x39,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 321b \n"
    "jmp 99f \n"

    // Order 3,0,1,2: rotate bytes up by one ($0x93).
    LABELALIGN
  "2103: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0x93,%%xmm0,%%xmm0 \n"
    "pshuflw $0x93,%%xmm0,%%xmm0 \n"
    "pshufhw $0x93,%%xmm1,%%xmm1 \n"
    "pshuflw $0x93,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 2103b \n"
    "jmp 99f \n"

    // Order 2,1,0,3: swap bytes 0 and 2 ($0xc6).
    LABELALIGN
  "3012: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm5,%%xmm0 \n"
    "punpckhbw %%xmm5,%%xmm1 \n"
    "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
    "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
    "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
    "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
    "packuswb %%xmm1,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "jg 3012b \n"

  "99: \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)          // %3
  : "r"(shuffler)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2

#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleaves planar I422 Y/U/V into packed YUY2 (Y0 U0 Y1 V0 ...).
// Processes 16 Y values (8 U, 8 V) per loop; width must be a multiple
// handled by the caller. src_v is accessed as an offset from src_u.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"                             // %2 = src_v - src_u
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"          // 8 U
    MEMOPREG(movq,0x00,1,2,1,xmm3)             // movq (%1,%2,1),%%xmm3 ; 8 V
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"               // UV interleaved
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"        // 16 Y
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"               // Y/UV -> YUY2 low
    "punpckhbw %%xmm2,%%xmm1 \n"               // YUY2 high
    "movdqu %%xmm0," MEMACCESS(3) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
// Interleaves planar I422 Y/U/V into packed UYVY (U0 Y0 V0 Y1 ...).
// Same structure as I422ToYUY2Row_SSE2 with the UV/Y interleave order
// swapped. Processes 16 Y values per loop.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
  asm volatile (
    "sub %1,%2 \n"                             // %2 = src_v - src_u
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(1) ",%%xmm2 \n"          // 8 U
    MEMOPREG(movq,0x00,1,2,1,xmm3)             // movq (%1,%2,1),%%xmm3 ; 8 V
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "punpcklbw %%xmm3,%%xmm2 \n"               // UV interleaved
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"        // 16 Y
    "movdqa %%xmm2,%%xmm1 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"               // UV/Y -> UYVY low
    "punpckhbw %%xmm0,%%xmm2 \n"               // UYVY high
    "movdqu %%xmm1," MEMACCESS(3) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
    "lea " MEMLEA(0x20,3) ",%3 \n"
    "sub $0x10,%4 \n"
    "jg 1b \n"
  : "+r"(src_y),      // %0
    "+r"(src_u),      // %1
    "+r"(src_v),      // %2
    "+r"(dst_frame),  // %3
    "+rm"(width)      // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to every channel of every pixel:
//   out = clamp(C0 + C1*x + C2*x^2 + C3*x^3)
// where poly holds four 4-float coefficient vectors (C0 at poly+0,
// C1 at +0x10, C2 at +0x20, C3 at +0x30). Processes 2 pixels per loop.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor %%xmm3,%%xmm3 \n"                    // zero for widening

    // 2 pixel loop.
    LABELALIGN
  "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"          // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm3,%%xmm0 \n"               // bytes -> words
    "movdqa %%xmm0,%%xmm4 \n"
    "punpcklwd %%xmm3,%%xmm0 \n"               // words -> dwords (pixel 0)
    "punpckhwd %%xmm3,%%xmm4 \n"               // words -> dwords (pixel 1)
    "cvtdq2ps %%xmm0,%%xmm0 \n"                // x as floats
    "cvtdq2ps %%xmm4,%%xmm4 \n"
    "movdqa %%xmm0,%%xmm1 \n"                  // keep x
    "movdqa %%xmm4,%%xmm5 \n"
    "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"   // C1 * x
    "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
    "addps " MEMACCESS(3) ",%%xmm0 \n"         // + C0
    "addps " MEMACCESS(3) ",%%xmm4 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "movdqa %%xmm5,%%xmm6 \n"
    "mulps %%xmm1,%%xmm2 \n"                   // x^2
    "mulps %%xmm5,%%xmm6 \n"
    "mulps %%xmm2,%%xmm1 \n"                   // x^3
    "mulps %%xmm6,%%xmm5 \n"
    "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"   // C2 * x^2
    "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
    "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"   // C3 * x^3
    "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
    "addps %%xmm2,%%xmm0 \n"
    "addps %%xmm6,%%xmm4 \n"
    "addps %%xmm1,%%xmm0 \n"
    "addps %%xmm5,%%xmm4 \n"
    "cvttps2dq %%xmm0,%%xmm0 \n"
    "cvttps2dq %%xmm4,%%xmm4 \n"
    "packuswb %%xmm4,%%xmm0 \n"                // saturate back to bytes
    "packuswb %%xmm0,%%xmm0 \n"
    "sub $0x2,%2 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"          // store 2 pixels
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2

#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2+FMA3 version of ARGBPolynomialRow_SSE2: same cubic per-channel
// polynomial, coefficients broadcast once, 2 pixels per loop.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"       // C0
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n" // C1
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n" // C2
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n" // C3

    // 2 pixel loop.
    LABELALIGN
  "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"     // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"               // X 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"           // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"           // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"      // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"      // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"      // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "sub $0x2,%2 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    "vzeroupper \n"                            // avoid AVX->SSE penalty
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
// TODO(fbarchard): declare ymm usage when applicable.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table, in place:
//   dst[4*i+c] = table_argb[4*dst[4*i+c] + c]  for channels c = 0..3
// (table entries are interleaved per channel). 1 pixel per loop.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;  // scratch index register (%1, bl usable)
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n"  // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),    // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)        // %2
  : "r"(table_argb)    // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table, in place. Same layout as
// ARGBColorTableRow_X86 but only remaps the first 3 channels; the
// alpha byte is left untouched. 1 pixel per loop.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;  // scratch index register (%1, bl usable)
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),    // %0
    "+d"(pixel_temp),  // %1
    "+r"(width)        // %2
  : "r"(table_argb)    // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with a luma-dependent table.
// A per-pixel luma is computed with pmaddubsw using the 'lumacoeff'
// byte weights, masked to a multiple of 256 (pand with 0xff00 words),
// and added to 'luma' to select a 256-byte table row; B, G and R are
// remapped through that row and alpha is copied unchanged.
// Processes 4 pixels per loop.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;  // %0: scratch byte (dl usable)
  uintptr_t table_temp = 0u;  // %1: current luma table row pointer
  asm volatile (
    "movd %6,%%xmm3 \n"                        // lumacoeff weights
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"                     // 0xff00 word mask
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
  "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"        // 4 ARGB pixels
    "pmaddubsw %%xmm3,%%xmm0 \n"               // weighted luma per pixel
    "phaddw %%xmm0,%%xmm0 \n"
    "pand %%xmm4,%%xmm0 \n"                    // round down to row offset
    "punpcklwd %%xmm5,%%xmm0 \n"               // 4 dword offsets
    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"                             // %1 = luma + row offset
    "pshufd $0x39,%%xmm0,%%xmm0 \n"            // rotate to next offset

    // Pixel 0: remap B, G, R through table row; copy alpha.
    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"         // alpha copied as-is

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    // Pixel 1.
    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    BUNDLEALIGN
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    // Pixel 2.
    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n"  // 32 bit offset
    "add %5,%1 \n"

    // Pixel 3.
    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "sub $0x4,%4 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "jg 1b \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif