media/libyuv/source/row_posix.cc

author:    Michael Schloh von Bennewitz <michael@schloh.com>
date:      Thu, 22 Jan 2015 13:21:57 +0100
branch:    TOR_BUG_9701
changeset: 15:b8a032363ba2

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/*
 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "libyuv/row.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
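
// The vectors above are 7-bit fixed-point BT.601 luma coefficients stored
// in B,G,R,A order to match little-endian ARGB memory. kARGBToY is studio
// swing (biased by kAddY16 further down); kARGBToYJ is the full-range JPeg
// variant (rounded by kAddYJ64). Scalar sketches of the per-pixel math the
// SSSE3 rows implement -- illustrative only, these helpers are not part of
// libyuv:
static inline uint8 ScalarARGBToY(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
static inline uint8 ScalarARGBToYJ(uint8 b, uint8 g, uint8 r) {
  return (uint8)((15 * b + 75 * g + 38 * r + 64) >> 7);
}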
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
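
// kARGBToU/kARGBToV (and the J variants) are 8-bit fixed-point chroma
// coefficients in B,G,R,A order; the row functions arithmetic-shift the
// signed dot product right by 8 and bias the result by kAddUV128 (or fold
// kAddUVJ128 in before the shift for rounding). Scalar sketches --
// illustrative only, these helpers are not part of libyuv:
static inline uint8 ScalarARGBToU(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static inline uint8 ScalarARGBToV(uint8 b, uint8 g, uint8 r) {
  return (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}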
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

#ifdef HAS_RGB24TOARGBROW_SSSE3

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};
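
// Each pshufb with the masks above expands 12 packed RGB bytes into the
// 16 bytes of 4 ARGB pixels; kShuffleMaskRAWToARGB additionally swaps R
// and B. Indices 12-15 land in the alpha lanes, which the row functions
// then force to 0xff with a por. A scalar sketch of the RGB24 case --
// illustrative only, this helper is not part of libyuv:
static inline void ScalarRGB24ToARGB(const uint8* src_rgb24, uint8* dst_argb,
                                     int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    dst_argb[0] = src_rgb24[0]; // B stays in place; RAW would swap R and B.
    dst_argb[1] = src_rgb24[1]; // G
    dst_argb[2] = src_rgb24[2]; // R
    dst_argb[3] = 255u; // A forced opaque, as the por with 0xff000000 does.
    src_rgb24 += 3;
    dst_argb += 4;
  }
}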

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24. First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGB to RAW for I422ToRAW. First 8 + next 4
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
#endif // HAS_RGB24TOARGBROW_SSSE3

#if defined(TESTING) && defined(__x86_64__)
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align 5 \n"
    "mov %%eax,%%eax \n"
    "mov %%ebx,%%ebx \n"
    "mov %%ecx,%%ecx \n"
    "mov %%edx,%%edx \n"
    "mov %%esi,%%esi \n"
    "mov %%edi,%%edi \n"
    "mov %%ebp,%%ebp \n"
    "mov %%esp,%%esp \n"
    ".p2align 5 \n"
    "mov %%r8d,%%r8d \n"
    "mov %%r9d,%%r9d \n"
    "mov %%r10d,%%r10d \n"
    "mov %%r11d,%%r11d \n"
    "mov %%r12d,%%r12d \n"
    "mov %%r13d,%%r13d \n"
    "mov %%r14d,%%r14d \n"
    "mov %%r15d,%%r15d \n"
    ".p2align 5 \n"
    "lea (%%rax),%%eax \n"
    "lea (%%rbx),%%ebx \n"
    "lea (%%rcx),%%ecx \n"
    "lea (%%rdx),%%edx \n"
    "lea (%%rsi),%%esi \n"
    "lea (%%rdi),%%edi \n"
    "lea (%%rbp),%%ebp \n"
    "lea (%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea (%%r8),%%r8d \n"
    "lea (%%r9),%%r9d \n"
    "lea (%%r10),%%r10d \n"
    "lea (%%r11),%%r11d \n"
    "lea (%%r12),%%r12d \n"
    "lea (%%r13),%%r13d \n"
    "lea (%%r14),%%r14d \n"
    "lea (%%r15),%%r15d \n"

    ".p2align 5 \n"
    "lea 0x10(%%rax),%%eax \n"
    "lea 0x10(%%rbx),%%ebx \n"
    "lea 0x10(%%rcx),%%ecx \n"
    "lea 0x10(%%rdx),%%edx \n"
    "lea 0x10(%%rsi),%%esi \n"
    "lea 0x10(%%rdi),%%edi \n"
    "lea 0x10(%%rbp),%%ebp \n"
    "lea 0x10(%%rsp),%%esp \n"
    ".p2align 5 \n"
    "lea 0x10(%%r8),%%r8d \n"
    "lea 0x10(%%r9),%%r9d \n"
    "lea 0x10(%%r10),%%r10d \n"
    "lea 0x10(%%r11),%%r11d \n"
    "lea 0x10(%%r12),%%r12d \n"
    "lea 0x10(%%r13),%%r13d \n"
    "lea 0x10(%%r14),%%r14d \n"
    "lea 0x10(%%r15),%%r15d \n"

    ".p2align 5 \n"
    "add 0x10,%%eax \n"
    "add 0x10,%%ebx \n"
    "add 0x10,%%ecx \n"
    "add 0x10,%%edx \n"
    "add 0x10,%%esi \n"
    "add 0x10,%%edi \n"
    "add 0x10,%%ebp \n"
    "add 0x10,%%esp \n"
    ".p2align 5 \n"
    "add 0x10,%%r8d \n"
    "add 0x10,%%r9d \n"
    "add 0x10,%%r10d \n"
    "add 0x10,%%r11d \n"
    "add 0x10,%%r12d \n"
    "add 0x10,%%r13d \n"
    "add 0x10,%%r14d \n"
    "add 0x10,%%r15d \n"

    ".p2align 2 \n"
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
#endif // TESTING && __x86_64__

#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}

void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0x18,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "punpcklbw %%xmm0,%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklwd %%xmm0,%%xmm0 \n"
    "punpckhwd %%xmm1,%%xmm1 \n"
    "por %%xmm5,%%xmm0 \n"
    "por %%xmm5,%%xmm1 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src_y), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
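
// Both variants above replicate each luma byte into B, G and R with the
// punpck sequence and force alpha to 0xff; they differ only in aligned
// (movdqa) vs unaligned (movdqu) stores. A scalar sketch -- illustrative
// only, this helper is not part of libyuv:
static inline void ScalarI400ToARGB(const uint8* src_y, uint8* dst_argb,
                                    int pix) {
  int i;
  for (i = 0; i < pix; ++i) {
    uint8 y = src_y[i];
    dst_argb[0] = y; // B
    dst_argb[1] = y; // G
    dst_argb[2] = y; // R
    dst_argb[3] = 255u; // A
    dst_argb += 4;
  }
}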
#endif // HAS_I400TOARGBROW_SSE2

#ifdef HAS_RGB24TOARGBROW_SSSE3
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_rgb24), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskRGB24ToARGB) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
    "pslld $0x18,%%xmm5 \n"
    "movdqa %3,%%xmm4 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x30,0) ",%0 \n"
    "movdqa %%xmm3,%%xmm2 \n"
    "palignr $0x8,%%xmm1,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm2 \n"
    "por %%xmm5,%%xmm2 \n"
    "palignr $0xc,%%xmm0,%%xmm1 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n"
    "por %%xmm5,%%xmm0 \n"
    "pshufb %%xmm4,%%xmm1 \n"
    "movdqa %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "palignr $0x4,%%xmm3,%%xmm3 \n"
    "pshufb %%xmm4,%%xmm3 \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
    "por %%xmm5,%%xmm3 \n"
    "sub $0x10,%2 \n"
    "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n"
    "lea " MEMLEA(0x40,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_raw), // %0
    "+r"(dst_argb), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskRAWToARGB) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x20802080,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0xa,%%xmm4 \n"
    "psrlw $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "por %%xmm7,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
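
// The pmulhuw constants above implement bit replication: multiplying a
// 5-bit field by 0x0108 and keeping the high word computes
// (v << 3) | (v >> 2), and 0x2080 computes (v << 2) | (v >> 4) for the
// 6-bit green field. A scalar sketch of one pixel -- illustrative only,
// this helper is not part of libyuv:
static inline void ScalarRGB565ToARGB(uint16 rgb565, uint8* dst_argb) {
  uint8 b = (uint8)(rgb565 & 0x1f);
  uint8 g = (uint8)((rgb565 >> 5) & 0x3f);
  uint8 r = (uint8)((rgb565 >> 11) & 0x1f);
  dst_argb[0] = (uint8)((b << 3) | (b >> 2));
  dst_argb[1] = (uint8)((g << 2) | (g >> 4));
  dst_argb[2] = (uint8)((r << 3) | (r >> 2));
  dst_argb[3] = 255u;
}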

void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0x1080108,%%eax \n"
    "movd %%eax,%%xmm5 \n"
    "pshufd $0x0,%%xmm5,%%xmm5 \n"
    "mov $0x42004200,%%eax \n"
    "movd %%eax,%%xmm6 \n"
    "pshufd $0x0,%%xmm6,%%xmm6 \n"
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psllw $0xb,%%xmm3 \n"
    "movdqa %%xmm3,%%xmm4 \n"
    "psrlw $0x6,%%xmm4 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "psllw $0x8,%%xmm7 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "psllw $0x1,%%xmm1 \n"
    "psllw $0xb,%%xmm2 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pmulhuw %%xmm5,%%xmm2 \n"
    "pmulhuw %%xmm5,%%xmm1 \n"
    "psllw $0x8,%%xmm1 \n"
    "por %%xmm2,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "psraw $0x8,%%xmm2 \n"
    "pmulhuw %%xmm6,%%xmm0 \n"
    "pand %%xmm7,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "movdqa %%xmm1,%%xmm2 \n"
    "punpcklbw %%xmm0,%%xmm1 \n"
    "punpckhbw %%xmm0,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}

void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov $0xf0f0f0f,%%eax \n"
    "movd %%eax,%%xmm4 \n"
    "pshufd $0x0,%%xmm4,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x4,%%xmm5 \n"
    "sub %0,%1 \n"
    "sub %0,%1 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pand %%xmm4,%%xmm0 \n"
    "pand %%xmm5,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm3 \n"
    "psllw $0x4,%%xmm1 \n"
    "psrlw $0x4,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpcklbw %%xmm2,%%xmm0 \n"
    "punpckhbw %%xmm2,%%xmm1 \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2)
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "sub $0x8,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
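
// For 4444 each 4-bit channel is duplicated into both nibbles of the
// output byte, i.e. v * 17 == (v << 4) | v, which the psllw/psrlw and por
// above compute 16 lanes at a time. A scalar sketch of one pixel --
// illustrative only, this helper is not part of libyuv:
static inline void ScalarARGB4444ToARGB(uint16 argb4444, uint8* dst_argb) {
  uint8 b = (uint8)(argb4444 & 0xf);
  uint8 g = (uint8)((argb4444 >> 4) & 0xf);
  uint8 r = (uint8)((argb4444 >> 8) & 0xf);
  uint8 a = (uint8)((argb4444 >> 12) & 0xf);
  dst_argb[0] = (uint8)(b * 17);
  dst_argb[1] = (uint8)(g * 17);
  dst_argb[2] = (uint8)(r * 17);
  dst_argb[3] = (uint8)(a * 17);
}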

void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskARGBToRGB24) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa %3,%%xmm6 \n"
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(0) ",%%xmm0 \n"
    "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "pshufb %%xmm6,%%xmm0 \n"
    "pshufb %%xmm6,%%xmm1 \n"
    "pshufb %%xmm6,%%xmm2 \n"
    "pshufb %%xmm6,%%xmm3 \n"
    "movdqa %%xmm1,%%xmm4 \n"
    "psrldq $0x4,%%xmm1 \n"
    "pslldq $0xc,%%xmm4 \n"
    "movdqa %%xmm2,%%xmm5 \n"
    "por %%xmm4,%%xmm0 \n"
    "pslldq $0x8,%%xmm5 \n"
    "movdqu %%xmm0," MEMACCESS(1) " \n"
    "por %%xmm5,%%xmm1 \n"
    "psrldq $0x8,%%xmm2 \n"
    "pslldq $0x4,%%xmm3 \n"
    "por %%xmm3,%%xmm2 \n"
    "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n"
    "lea " MEMLEA(0x30,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  : "m"(kShuffleMaskARGBToRAW) // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm3,%%xmm3 \n"
    "psrld $0x1b,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1a,%%xmm4 \n"
    "pslld $0x5,%%xmm4 \n"
    "pcmpeqb %%xmm5,%%xmm5 \n"
    "pslld $0xb,%%xmm5 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "pslld $0x8,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x5,%%xmm2 \n"
    "psrad $0x10,%%xmm0 \n"
    "pand %%xmm3,%%xmm1 \n"
    "pand %%xmm4,%%xmm2 \n"
    "pand %%xmm5,%%xmm0 \n"
    "por %%xmm2,%%xmm1 \n"
    "por %%xmm1,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "sub $0x4,%2 \n"
    "jg 1b \n"
  : "+r"(src), // %0
    "+r"(dst), // %1
    "+r"(pix) // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psrld $0x1b,%%xmm4 \n"
    "movdqa %%xmm4,%%xmm5 \n"
    "pslld $0x5,%%xmm5 \n"
    "movdqa %%xmm4,%%xmm6 \n"
    "pslld $0xa,%%xmm6 \n"
    "pcmpeqb %%xmm7,%%xmm7 \n"
    "pslld $0xf,%%xmm7 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm0,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm3 \n"
    "psrad $0x10,%%xmm0 \n"
    "psrld $0x3,%%xmm1 \n"
    "psrld $0x6,%%xmm2 \n"
    "psrld $0x9,%%xmm3 \n"
    "pand %%xmm7,%%xmm0 \n"
    "pand %%xmm4,%%xmm1 \n"
    "pand %%xmm5,%%xmm2 \n"
    "pand %%xmm6,%%xmm3 \n"
    "por %%xmm1,%%xmm0 \n"
    "por %%xmm3,%%xmm2 \n"
    "por %%xmm2,%%xmm0 \n"
    "packssdw %%xmm0,%%xmm0 \n"
    "lea " MEMLEA(0x10,0) ",%0 \n"
    "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 693 "lea " MEMACCESS2(0x8,1) ",%1 \n"
michael@0 694 "sub $0x4,%2 \n"
michael@0 695 "jg 1b \n"
michael@0 696 : "+r"(src), // %0
michael@0 697 "+r"(dst), // %1
michael@0 698 "+r"(pix) // %2
michael@0 699 :
michael@0 700 : "memory", "cc"
michael@0 701 #if defined(__SSE2__)
michael@0 702 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 703 #endif
michael@0 704 );
michael@0 705 }
michael@0 706
michael@0 707 void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
michael@0 708 asm volatile (
michael@0 709 "pcmpeqb %%xmm4,%%xmm4 \n"
michael@0 710 "psllw $0xc,%%xmm4 \n"
michael@0 711 "movdqa %%xmm4,%%xmm3 \n"
michael@0 712 "psrlw $0x8,%%xmm3 \n"
michael@0 713 LABELALIGN
michael@0 714 "1: \n"
michael@0 715 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 716 "movdqa %%xmm0,%%xmm1 \n"
michael@0 717 "pand %%xmm3,%%xmm0 \n"
michael@0 718 "pand %%xmm4,%%xmm1 \n"
michael@0 719 "psrlq $0x4,%%xmm0 \n"
michael@0 720 "psrlq $0x8,%%xmm1 \n"
michael@0 721 "por %%xmm1,%%xmm0 \n"
michael@0 722 "packuswb %%xmm0,%%xmm0 \n"
michael@0 723 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 724 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 725 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 726 "sub $0x4,%2 \n"
michael@0 727 "jg 1b \n"
michael@0 728 : "+r"(src), // %0
michael@0 729 "+r"(dst), // %1
michael@0 730 "+r"(pix) // %2
michael@0 731 :
michael@0 732 : "memory", "cc"
michael@0 733 #if defined(__SSE2__)
michael@0 734 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
michael@0 735 #endif
michael@0 736 );
michael@0 737 }
michael@0 738 #endif // HAS_RGB24TOARGBROW_SSSE3
michael@0 739
michael@0 740 #ifdef HAS_ARGBTOYROW_SSSE3
michael@0 741 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
michael@0 742 asm volatile (
michael@0 743 "movdqa %4,%%xmm5 \n"
michael@0 744 "movdqa %3,%%xmm4 \n"
michael@0 745 LABELALIGN
michael@0 746 "1: \n"
michael@0 747 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 748 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 749 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 750 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 751 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 752 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 753 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 754 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 755 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 756 "phaddw %%xmm1,%%xmm0 \n"
michael@0 757 "phaddw %%xmm3,%%xmm2 \n"
michael@0 758 "psrlw $0x7,%%xmm0 \n"
michael@0 759 "psrlw $0x7,%%xmm2 \n"
michael@0 760 "packuswb %%xmm2,%%xmm0 \n"
michael@0 761 "paddb %%xmm5,%%xmm0 \n"
michael@0 762 "sub $0x10,%2 \n"
michael@0 763 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 764 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 765 "jg 1b \n"
michael@0 766 : "+r"(src_argb), // %0
michael@0 767 "+r"(dst_y), // %1
michael@0 768 "+r"(pix) // %2
michael@0 769 : "m"(kARGBToY), // %3
michael@0 770 "m"(kAddY16) // %4
michael@0 771 : "memory", "cc"
michael@0 772 #if defined(__SSE2__)
michael@0 773 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 774 #endif
michael@0 775 );
michael@0 776 }
michael@0 777
michael@0 778 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
michael@0 779 asm volatile (
michael@0 780 "movdqa %4,%%xmm5 \n"
michael@0 781 "movdqa %3,%%xmm4 \n"
michael@0 782 LABELALIGN
michael@0 783 "1: \n"
michael@0 784 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 785 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 786 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 787 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 788 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 789 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 790 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 791 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 792 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 793 "phaddw %%xmm1,%%xmm0 \n"
michael@0 794 "phaddw %%xmm3,%%xmm2 \n"
michael@0 795 "psrlw $0x7,%%xmm0 \n"
michael@0 796 "psrlw $0x7,%%xmm2 \n"
michael@0 797 "packuswb %%xmm2,%%xmm0 \n"
michael@0 798 "paddb %%xmm5,%%xmm0 \n"
michael@0 799 "sub $0x10,%2 \n"
michael@0 800 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 801 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 802 "jg 1b \n"
michael@0 803 : "+r"(src_argb), // %0
michael@0 804 "+r"(dst_y), // %1
michael@0 805 "+r"(pix) // %2
michael@0 806 : "m"(kARGBToY), // %3
michael@0 807 "m"(kAddY16) // %4
michael@0 808 : "memory", "cc"
michael@0 809 #if defined(__SSE2__)
michael@0 810 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 811 #endif
michael@0 812 );
michael@0 813 }
michael@0 814 #endif // HAS_ARGBTOYROW_SSSE3
michael@0 815
michael@0 816 #ifdef HAS_ARGBTOYJROW_SSSE3
michael@0 817 void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
michael@0 818 asm volatile (
michael@0 819 "movdqa %3,%%xmm4 \n"
michael@0 820 "movdqa %4,%%xmm5 \n"
michael@0 821 LABELALIGN
michael@0 822 "1: \n"
michael@0 823 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 824 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 825 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 826 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 827 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 828 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 829 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 830 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 831 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 832 "phaddw %%xmm1,%%xmm0 \n"
michael@0 833 "phaddw %%xmm3,%%xmm2 \n"
michael@0 834 "paddw %%xmm5,%%xmm0 \n"
michael@0 835 "paddw %%xmm5,%%xmm2 \n"
michael@0 836 "psrlw $0x7,%%xmm0 \n"
michael@0 837 "psrlw $0x7,%%xmm2 \n"
michael@0 838 "packuswb %%xmm2,%%xmm0 \n"
michael@0 839 "sub $0x10,%2 \n"
michael@0 840 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 841 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 842 "jg 1b \n"
michael@0 843 : "+r"(src_argb), // %0
michael@0 844 "+r"(dst_y), // %1
michael@0 845 "+r"(pix) // %2
michael@0 846 : "m"(kARGBToYJ), // %3
michael@0 847 "m"(kAddYJ64) // %4
michael@0 848 : "memory", "cc"
michael@0 849 #if defined(__SSE2__)
michael@0 850 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 851 #endif
michael@0 852 );
michael@0 853 }
michael@0 854
michael@0 855 void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
michael@0 856 asm volatile (
michael@0 857 "movdqa %3,%%xmm4 \n"
michael@0 858 "movdqa %4,%%xmm5 \n"
michael@0 859 LABELALIGN
michael@0 860 "1: \n"
michael@0 861 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 862 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 863 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 864 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 865 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 866 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 867 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 868 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 869 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 870 "phaddw %%xmm1,%%xmm0 \n"
michael@0 871 "phaddw %%xmm3,%%xmm2 \n"
michael@0 872 "paddw %%xmm5,%%xmm0 \n"
michael@0 873 "paddw %%xmm5,%%xmm2 \n"
michael@0 874 "psrlw $0x7,%%xmm0 \n"
michael@0 875 "psrlw $0x7,%%xmm2 \n"
michael@0 876 "packuswb %%xmm2,%%xmm0 \n"
michael@0 877 "sub $0x10,%2 \n"
michael@0 878 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 879 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 880 "jg 1b \n"
michael@0 881 : "+r"(src_argb), // %0
michael@0 882 "+r"(dst_y), // %1
michael@0 883 "+r"(pix) // %2
michael@0 884 : "m"(kARGBToYJ), // %3
michael@0 885 "m"(kAddYJ64) // %4
michael@0 886 : "memory", "cc"
michael@0 887 #if defined(__SSE2__)
michael@0 888 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 889 #endif
michael@0 890 );
michael@0 891 }
michael@0 892 #endif // HAS_ARGBTOYJROW_SSSE3
michael@0 893
michael@0 894 #ifdef HAS_ARGBTOUVROW_SSSE3
// TODO(fbarchard): pass xmm constants to single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if stack frame is disabled. Doing 2 assembly blocks is a workaround
// and considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm3 \n"
    "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU), // %0
    "m"(kARGBToV), // %1
    "m"(kAddUV128) // %2
  );
  asm volatile (
    "sub %1,%2 \n"
    LABELALIGN
    "1: \n"
    "movdqa " MEMACCESS(0) ",%%xmm0 \n"
    "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
    "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
    "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
    "lea " MEMLEA(0x40,0) ",%0 \n"
    "movdqa %%xmm0,%%xmm7 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm0 \n"
    "movdqa %%xmm2,%%xmm7 \n"
    "shufps $0x88,%%xmm6,%%xmm2 \n"
    "shufps $0xdd,%%xmm6,%%xmm7 \n"
    "pavgb %%xmm7,%%xmm2 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "movdqa %%xmm2,%%xmm6 \n"
    "pmaddubsw %%xmm4,%%xmm0 \n"
    "pmaddubsw %%xmm4,%%xmm2 \n"
    "pmaddubsw %%xmm3,%%xmm1 \n"
    "pmaddubsw %%xmm3,%%xmm6 \n"
    "phaddw %%xmm2,%%xmm0 \n"
    "phaddw %%xmm6,%%xmm1 \n"
    "psraw $0x8,%%xmm0 \n"
    "psraw $0x8,%%xmm1 \n"
    "packsswb %%xmm1,%%xmm0 \n"
    "paddb %%xmm5,%%xmm0 \n"
    "sub $0x10,%3 \n"
    "movlps %%xmm0," MEMACCESS(1) " \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
  : "+r"(src_argb0), // %0
    "+r"(dst_u), // %1
    "+r"(dst_v), // %2
    "+rm"(width) // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
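
// The pavgb/shufps sequence above averages each 2x2 block of pixels:
// pavgb against the next row handles the vertical average, then shufps
// $0x88/$0xdd deinterleaves even and odd pixels so a second pavgb can
// average horizontally before the U and V dot products. A scalar sketch
// for one output U,V pair -- illustrative only, this helper is not part
// of libyuv (pavgb rounds each pairwise average, so SIMD output can
// differ from this by 1):
static inline void ScalarARGBToUV2x2(const uint8* row0, const uint8* row1,
                                     uint8* dst_u, uint8* dst_v) {
  int b = (row0[0] + row0[4] + row1[0] + row1[4] + 2) >> 2;
  int g = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
  int r = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
  *dst_u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *dst_v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}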
michael@0 965
michael@0 966 // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
michael@0 967 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 968 uint8* dst_u, uint8* dst_v, int width) {
michael@0 969 asm volatile (
michael@0 970 "movdqa %0,%%xmm4 \n"
michael@0 971 "movdqa %1,%%xmm3 \n"
michael@0 972 "movdqa %2,%%xmm5 \n"
michael@0 973 :
michael@0 974 : "m"(kARGBToUJ), // %0
michael@0 975 "m"(kARGBToVJ), // %1
michael@0 976 "m"(kAddUVJ128) // %2
michael@0 977 );
michael@0 978 asm volatile (
michael@0 979 "sub %1,%2 \n"
michael@0 980 LABELALIGN
michael@0 981 "1: \n"
michael@0 982 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 983 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 984 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 985 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 986 BUNDLEALIGN
michael@0 987 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
michael@0 988 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
michael@0 989 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
michael@0 990 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
michael@0 991 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 992 "movdqa %%xmm0,%%xmm7 \n"
michael@0 993 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 994 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 995 "pavgb %%xmm7,%%xmm0 \n"
michael@0 996 "movdqa %%xmm2,%%xmm7 \n"
michael@0 997 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 998 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 999 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1000 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1001 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1002 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1003 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1004 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1005 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1006 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1007 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1008 "paddw %%xmm5,%%xmm0 \n"
michael@0 1009 "paddw %%xmm5,%%xmm1 \n"
michael@0 1010 "psraw $0x8,%%xmm0 \n"
michael@0 1011 "psraw $0x8,%%xmm1 \n"
michael@0 1012 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1013 "sub $0x10,%3 \n"
michael@0 1014 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1015 BUNDLEALIGN
michael@0 1016 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1017 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1018 "jg 1b \n"
michael@0 1019 : "+r"(src_argb0), // %0
michael@0 1020 "+r"(dst_u), // %1
michael@0 1021 "+r"(dst_v), // %2
michael@0 1022 "+rm"(width) // %3
michael@0 1023 : "r"((intptr_t)(src_stride_argb)) // %4
michael@0 1024 : "memory", "cc"
michael@0 1025 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1026 , "r14"
michael@0 1027 #endif
michael@0 1028 #if defined(__SSE2__)
michael@0 1029 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1030 #endif
michael@0 1031 );
michael@0 1032 }
michael@0 1033
michael@0 1034 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1035 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1036 asm volatile (
michael@0 1037 "movdqa %0,%%xmm4 \n"
michael@0 1038 "movdqa %1,%%xmm3 \n"
michael@0 1039 "movdqa %2,%%xmm5 \n"
michael@0 1040 :
michael@0 1041 : "m"(kARGBToU), // %0
michael@0 1042 "m"(kARGBToV), // %1
michael@0 1043 "m"(kAddUV128) // %2
michael@0 1044 );
michael@0 1045 asm volatile (
michael@0 1046 "sub %1,%2 \n"
michael@0 1047 LABELALIGN
michael@0 1048 "1: \n"
michael@0 1049 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1050 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1051 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1052 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1053 BUNDLEALIGN
michael@0 1054 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
michael@0 1055 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1056 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
michael@0 1057 "pavgb %%xmm7,%%xmm1 \n"
michael@0 1058 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
michael@0 1059 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1060 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
michael@0 1061 "pavgb %%xmm7,%%xmm6 \n"
michael@0 1062 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1063 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1064 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1065 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1066 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1067 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1068 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1069 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1070 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1071 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1072 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1073 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1074 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1075 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1076 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1077 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1078 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1079 "psraw $0x8,%%xmm0 \n"
michael@0 1080 "psraw $0x8,%%xmm1 \n"
michael@0 1081 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1082 "paddb %%xmm5,%%xmm0 \n"
michael@0 1083 "sub $0x10,%3 \n"
michael@0 1084 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1085 BUNDLEALIGN
michael@0 1086 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1087 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1088 "jg 1b \n"
michael@0 1089 : "+r"(src_argb0), // %0
michael@0 1090 "+r"(dst_u), // %1
michael@0 1091 "+r"(dst_v), // %2
michael@0 1092 "+rm"(width) // %3
michael@0 1093 : "r"((intptr_t)(src_stride_argb)) // %4
michael@0 1094 : "memory", "cc"
michael@0 1095 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1096 , "r14"
michael@0 1097 #endif
michael@0 1098 #if defined(__SSE2__)
michael@0 1099 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1100 #endif
michael@0 1101 );
michael@0 1102 }
michael@0 1103
michael@0 1104 void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
michael@0 1105 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1106 asm volatile (
michael@0 1107 "movdqa %0,%%xmm4 \n"
michael@0 1108 "movdqa %1,%%xmm3 \n"
michael@0 1109 "movdqa %2,%%xmm5 \n"
michael@0 1110 :
michael@0 1111 : "m"(kARGBToUJ), // %0
michael@0 1112 "m"(kARGBToVJ), // %1
michael@0 1113 "m"(kAddUVJ128) // %2
michael@0 1114 );
michael@0 1115 asm volatile (
michael@0 1116 "sub %1,%2 \n"
michael@0 1117 LABELALIGN
michael@0 1118 "1: \n"
michael@0 1119 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1120 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1121 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1122 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1123 BUNDLEALIGN
michael@0 1124 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
michael@0 1125 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1126 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
michael@0 1127 "pavgb %%xmm7,%%xmm1 \n"
michael@0 1128 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
michael@0 1129 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1130 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
michael@0 1131 "pavgb %%xmm7,%%xmm6 \n"
michael@0 1132 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1133 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1134 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1135 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1136 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1137 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1138 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1139 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1140 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1141 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1142 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1143 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1144 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1145 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1146 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1147 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1148 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1149 "paddw %%xmm5,%%xmm0 \n"
michael@0 1150 "paddw %%xmm5,%%xmm1 \n"
michael@0 1151 "psraw $0x8,%%xmm0 \n"
michael@0 1152 "psraw $0x8,%%xmm1 \n"
michael@0 1153 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1154 "sub $0x10,%3 \n"
michael@0 1155 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1156 BUNDLEALIGN
michael@0 1157 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1158 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1159 "jg 1b \n"
michael@0 1160 : "+r"(src_argb0), // %0
michael@0 1161 "+r"(dst_u), // %1
michael@0 1162 "+r"(dst_v), // %2
michael@0 1163 "+rm"(width) // %3
michael@0 1164 : "r"((intptr_t)(src_stride_argb))
michael@0 1165 : "memory", "cc"
michael@0 1166 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1167 , "r14"
michael@0 1168 #endif
michael@0 1169 #if defined(__SSE2__)
michael@0 1170 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1171 #endif
michael@0 1172 );
michael@0 1173 }
michael@0 1174
michael@0 1175 void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
michael@0 1176 int width) {
michael@0 1177 asm volatile (
michael@0 1178 "movdqa %0,%%xmm4 \n"
michael@0 1179 "movdqa %1,%%xmm3 \n"
michael@0 1180 "movdqa %2,%%xmm5 \n"
michael@0 1181 :
michael@0 1182 : "m"(kARGBToU), // %0
michael@0 1183 "m"(kARGBToV), // %1
michael@0 1184 "m"(kAddUV128) // %2
michael@0 1185 );
michael@0 1186 asm volatile (
michael@0 1187 "sub %1,%2 \n"
michael@0 1188 LABELALIGN
michael@0 1189 "1: \n"
michael@0 1190 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1191 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1192 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1193 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1194 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1195 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1196 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1197 "pmaddubsw %%xmm4,%%xmm6 \n"
michael@0 1198 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1199 "phaddw %%xmm6,%%xmm2 \n"
michael@0 1200 "psraw $0x8,%%xmm0 \n"
michael@0 1201 "psraw $0x8,%%xmm2 \n"
michael@0 1202 "packsswb %%xmm2,%%xmm0 \n"
michael@0 1203 "paddb %%xmm5,%%xmm0 \n"
michael@0 1204 "sub $0x10,%3 \n"
michael@0 1205 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 1206 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1207 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1208 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1209 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1210 "pmaddubsw %%xmm3,%%xmm0 \n"
michael@0 1211 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1212 "pmaddubsw %%xmm3,%%xmm2 \n"
michael@0 1213 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1214 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1215 "phaddw %%xmm6,%%xmm2 \n"
michael@0 1216 "psraw $0x8,%%xmm0 \n"
michael@0 1217 "psraw $0x8,%%xmm2 \n"
michael@0 1218 "packsswb %%xmm2,%%xmm0 \n"
michael@0 1219 "paddb %%xmm5,%%xmm0 \n"
michael@0 1220 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1221 BUNDLEALIGN
michael@0 1222 MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1)
michael@0 1223 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1224 "jg 1b \n"
michael@0 1225 : "+r"(src_argb), // %0
michael@0 1226 "+r"(dst_u), // %1
michael@0 1227 "+r"(dst_v), // %2
michael@0 1228 "+rm"(width) // %3
michael@0 1229 :
michael@0 1230 : "memory", "cc"
michael@0 1231 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1232 , "r14"
michael@0 1233 #endif
michael@0 1234 #if defined(__SSE2__)
michael@0 1235 , "xmm0", "xmm1", "xmm2", "xmm6"
michael@0 1236 #endif
michael@0 1237 );
michael@0 1238 }
michael@0 1239
michael@0 1240 void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
michael@0 1241 uint8* dst_v, int width) {
michael@0 1242 asm volatile (
michael@0 1243 "movdqa %0,%%xmm4 \n"
michael@0 1244 "movdqa %1,%%xmm3 \n"
michael@0 1245 "movdqa %2,%%xmm5 \n"
michael@0 1246 :
michael@0 1247 : "m"(kARGBToU), // %0
michael@0 1248 "m"(kARGBToV), // %1
michael@0 1249 "m"(kAddUV128) // %2
michael@0 1250 );
michael@0 1251 asm volatile (
michael@0 1252 "sub %1,%2 \n"
michael@0 1253 LABELALIGN
michael@0 1254 "1: \n"
michael@0 1255 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1256 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1257 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1258 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1259 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1260 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1261 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1262 "pmaddubsw %%xmm4,%%xmm6 \n"
michael@0 1263 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1264 "phaddw %%xmm6,%%xmm2 \n"
michael@0 1265 "psraw $0x8,%%xmm0 \n"
michael@0 1266 "psraw $0x8,%%xmm2 \n"
michael@0 1267 "packsswb %%xmm2,%%xmm0 \n"
michael@0 1268 "paddb %%xmm5,%%xmm0 \n"
michael@0 1269 "sub $0x10,%3 \n"
michael@0 1270 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 1271 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1272 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1273 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1274 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1275 "pmaddubsw %%xmm3,%%xmm0 \n"
michael@0 1276 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1277 "pmaddubsw %%xmm3,%%xmm2 \n"
michael@0 1278 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1279 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1280 "phaddw %%xmm6,%%xmm2 \n"
michael@0 1281 "psraw $0x8,%%xmm0 \n"
michael@0 1282 "psraw $0x8,%%xmm2 \n"
michael@0 1283 "packsswb %%xmm2,%%xmm0 \n"
michael@0 1284 "paddb %%xmm5,%%xmm0 \n"
michael@0 1285 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1286 BUNDLEALIGN
michael@0 1287 MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1)
michael@0 1288 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1289 "jg 1b \n"
michael@0 1290 : "+r"(src_argb), // %0
michael@0 1291 "+r"(dst_u), // %1
michael@0 1292 "+r"(dst_v), // %2
michael@0 1293 "+rm"(width) // %3
michael@0 1294 :
michael@0 1295 : "memory", "cc"
michael@0 1296 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1297 , "r14"
michael@0 1298 #endif
michael@0 1299 #if defined(__SSE2__)
michael@0 1300 , "xmm0", "xmm1", "xmm2", "xmm6"
michael@0 1301 #endif
michael@0 1302 );
michael@0 1303 }
michael@0 1304
michael@0 1305 void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
michael@0 1306 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1307 asm volatile (
michael@0 1308 "movdqa %0,%%xmm4 \n"
michael@0 1309 "movdqa %1,%%xmm3 \n"
michael@0 1310 "movdqa %2,%%xmm5 \n"
michael@0 1311 :
michael@0 1312 : "m"(kARGBToU), // %0
michael@0 1313 "m"(kARGBToV), // %1
michael@0 1314 "m"(kAddUV128) // %2
michael@0 1315 );
michael@0 1316 asm volatile (
michael@0 1317 "sub %1,%2 \n"
michael@0 1318 LABELALIGN
michael@0 1319 "1: \n"
michael@0 1320 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1321 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1322 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1323 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1324 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1325 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1326 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1327 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1328 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1329 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1330 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1331 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1332 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1333 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1334 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1335 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1336 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1337 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1338 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1339 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1340 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1341 "psraw $0x8,%%xmm0 \n"
michael@0 1342 "psraw $0x8,%%xmm1 \n"
michael@0 1343 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1344 "paddb %%xmm5,%%xmm0 \n"
michael@0 1345 "sub $0x10,%3 \n"
michael@0 1346 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1347 BUNDLEALIGN
michael@0 1348 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1349 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1350 "jg 1b \n"
michael@0 1351 : "+r"(src_argb0), // %0
michael@0 1352 "+r"(dst_u), // %1
michael@0 1353 "+r"(dst_v), // %2
michael@0 1354 "+rm"(width) // %3
michael@0 1355 :
michael@0 1356 : "memory", "cc"
michael@0 1357 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1358 , "r14"
michael@0 1359 #endif
michael@0 1360 #if defined(__SSE2__)
michael@0 1361 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1362 #endif
michael@0 1363 );
michael@0 1364 }
michael@0 1365
michael@0 1366 void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
michael@0 1367 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1368 asm volatile (
michael@0 1369 "movdqa %0,%%xmm4 \n"
michael@0 1370 "movdqa %1,%%xmm3 \n"
michael@0 1371 "movdqa %2,%%xmm5 \n"
michael@0 1372 :
michael@0 1373 : "m"(kARGBToU), // %0
michael@0 1374 "m"(kARGBToV), // %1
michael@0 1375 "m"(kAddUV128) // %2
michael@0 1376 );
michael@0 1377 asm volatile (
michael@0 1378 "sub %1,%2 \n"
michael@0 1379 LABELALIGN
michael@0 1380 "1: \n"
michael@0 1381 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1382 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1383 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1384 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1385 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1386 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1387 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1388 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1389 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1390 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1391 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1392 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1393 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1394 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1395 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1396 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1397 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1398 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1399 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1400 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1401 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1402 "psraw $0x8,%%xmm0 \n"
michael@0 1403 "psraw $0x8,%%xmm1 \n"
michael@0 1404 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1405 "paddb %%xmm5,%%xmm0 \n"
michael@0 1406 "sub $0x10,%3 \n"
michael@0 1407 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1408 BUNDLEALIGN
michael@0 1409 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1410 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1411 "jg 1b \n"
michael@0 1412 : "+r"(src_argb0), // %0
michael@0 1413 "+r"(dst_u), // %1
michael@0 1414 "+r"(dst_v), // %2
michael@0 1415 "+rm"(width) // %3
michael@0 1416 :
michael@0 1417 : "memory", "cc"
michael@0 1418 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1419 , "r14"
michael@0 1420 #endif
michael@0 1421 #if defined(__SSE2__)
michael@0 1422 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1423 #endif
michael@0 1424 );
michael@0 1425 }
michael@0 1426
michael@0 1427 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
michael@0 1428 asm volatile (
michael@0 1429 "movdqa %4,%%xmm5 \n"
michael@0 1430 "movdqa %3,%%xmm4 \n"
michael@0 1431 LABELALIGN
michael@0 1432 "1: \n"
michael@0 1433 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1434 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1435 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1436 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 1437 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1438 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1439 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1440 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 1441 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1442 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1443 "phaddw %%xmm3,%%xmm2 \n"
michael@0 1444 "psrlw $0x7,%%xmm0 \n"
michael@0 1445 "psrlw $0x7,%%xmm2 \n"
michael@0 1446 "packuswb %%xmm2,%%xmm0 \n"
michael@0 1447 "paddb %%xmm5,%%xmm0 \n"
michael@0 1448 "sub $0x10,%2 \n"
michael@0 1449 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 1450 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1451 "jg 1b \n"
michael@0 1452 : "+r"(src_bgra), // %0
michael@0 1453 "+r"(dst_y), // %1
michael@0 1454 "+r"(pix) // %2
michael@0 1455 : "m"(kBGRAToY), // %3
michael@0 1456 "m"(kAddY16) // %4
michael@0 1457 : "memory", "cc"
michael@0 1458 #if defined(__SSE2__)
michael@0 1459 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 1460 #endif
michael@0 1461 );
michael@0 1462 }
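
// Per-pixel arithmetic of the BGRA luma rows, as a scalar sketch.  The name
// ScalarBGRAToYReference is hypothetical; the +16 assumes kAddY16 (defined
// elsewhere in this file) holds 16 in every byte, going by its name.
static void ScalarBGRAToYReference(const uint8* src_bgra, uint8* dst_y,
                                   int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    // kBGRAToY = {0, 33, 65, 13}, so bytes are A,R,G,B; psrlw $7, then +16.
    int r = src_bgra[1];
    int g = src_bgra[2];
    int b = src_bgra[3];
    dst_y[x] = (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
    src_bgra += 4;
  }
}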
michael@0 1463
michael@0 1464 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
michael@0 1465 asm volatile (
michael@0 1466 "movdqa %4,%%xmm5 \n"
michael@0 1467 "movdqa %3,%%xmm4 \n"
michael@0 1468 LABELALIGN
michael@0 1469 "1: \n"
michael@0 1470 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1471 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1472 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1473 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 1474 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1475 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1476 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1477 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 1478 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1479 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1480 "phaddw %%xmm3,%%xmm2 \n"
michael@0 1481 "psrlw $0x7,%%xmm0 \n"
michael@0 1482 "psrlw $0x7,%%xmm2 \n"
michael@0 1483 "packuswb %%xmm2,%%xmm0 \n"
michael@0 1484 "paddb %%xmm5,%%xmm0 \n"
michael@0 1485 "sub $0x10,%2 \n"
michael@0 1486 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 1487 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1488 "jg 1b \n"
michael@0 1489 : "+r"(src_bgra), // %0
michael@0 1490 "+r"(dst_y), // %1
michael@0 1491 "+r"(pix) // %2
michael@0 1492 : "m"(kBGRAToY), // %3
michael@0 1493 "m"(kAddY16) // %4
michael@0 1494 : "memory", "cc"
michael@0 1495 #if defined(__SSE2__)
michael@0 1496 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 1497 #endif
michael@0 1498 );
michael@0 1499 }
michael@0 1500
michael@0 1501 void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
michael@0 1502 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1503 asm volatile (
michael@0 1504 "movdqa %0,%%xmm4 \n"
michael@0 1505 "movdqa %1,%%xmm3 \n"
michael@0 1506 "movdqa %2,%%xmm5 \n"
michael@0 1507 :
michael@0 1508 : "m"(kBGRAToU), // %0
michael@0 1509 "m"(kBGRAToV), // %1
michael@0 1510 "m"(kAddUV128) // %2
michael@0 1511 );
michael@0 1512 asm volatile (
michael@0 1513 "sub %1,%2 \n"
michael@0 1514 LABELALIGN
michael@0 1515 "1: \n"
michael@0 1516 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1517 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1518 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1519 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1520 BUNDLEALIGN
michael@0 1521 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
michael@0 1522 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
michael@0 1523 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
michael@0 1524 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
michael@0 1525 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1526 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1527 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1528 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1529 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1530 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1531 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1532 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1533 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1534 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1535 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1536 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1537 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1538 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1539 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1540 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1541 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1542 "psraw $0x8,%%xmm0 \n"
michael@0 1543 "psraw $0x8,%%xmm1 \n"
michael@0 1544 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1545 "paddb %%xmm5,%%xmm0 \n"
michael@0 1546 "sub $0x10,%3 \n"
michael@0 1547 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1548 BUNDLEALIGN
michael@0 1549 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1550 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1551 "jg 1b \n"
michael@0 1552 : "+r"(src_bgra0), // %0
michael@0 1553 "+r"(dst_u), // %1
michael@0 1554 "+r"(dst_v), // %2
michael@0 1555 "+rm"(width) // %3
michael@0 1556 : "r"((intptr_t)(src_stride_bgra)) // %4
michael@0 1557 : "memory", "cc"
michael@0 1558 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1559 , "r14"
michael@0 1560 #endif
michael@0 1561 #if defined(__SSE2__)
michael@0 1562 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1563 #endif
michael@0 1564 );
michael@0 1565 }
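
// Scalar sketch of the 2x2 chroma box filter above.  Hypothetical name, for
// illustration: the single rounded average here stands in for the chained
// pavgb steps (vertical, then horizontal), which can differ by one LSB.
static void ScalarBGRAToUVReference(const uint8* src_bgra, int src_stride_bgra,
                                    uint8* dst_u, uint8* dst_v, int width) {
  int x;
  for (x = 0; x < width; x += 2) {
    const uint8* row0 = src_bgra;                    // this row
    const uint8* row1 = src_bgra + src_stride_bgra;  // next row
    // Box filter each 2x2 block of A,R,G,B pixels.
    int r = (row0[1] + row0[5] + row1[1] + row1[5] + 2) >> 2;
    int g = (row0[2] + row0[6] + row1[2] + row1[6] + 2) >> 2;
    int b = (row0[3] + row0[7] + row1[3] + row1[7] + 2) >> 2;
    // kBGRAToU/kBGRAToV coefficients, psraw $8, then +128 (kAddUV128).
    *dst_u++ = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    *dst_v++ = (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
    src_bgra += 8;
  }
}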
michael@0 1566
michael@0 1567 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
michael@0 1568 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1569 asm volatile (
michael@0 1570 "movdqa %0,%%xmm4 \n"
michael@0 1571 "movdqa %1,%%xmm3 \n"
michael@0 1572 "movdqa %2,%%xmm5 \n"
michael@0 1573 :
michael@0 1574 : "m"(kBGRAToU), // %0
michael@0 1575 "m"(kBGRAToV), // %1
michael@0 1576 "m"(kAddUV128) // %2
michael@0 1577 );
michael@0 1578 asm volatile (
michael@0 1579 "sub %1,%2 \n"
michael@0 1580 LABELALIGN
michael@0 1581 "1: \n"
michael@0 1582 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1583 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1584 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1585 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1586 BUNDLEALIGN
michael@0 1587 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
michael@0 1588 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1589 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
michael@0 1590 "pavgb %%xmm7,%%xmm1 \n"
michael@0 1591 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
michael@0 1592 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1593 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
michael@0 1594 "pavgb %%xmm7,%%xmm6 \n"
michael@0 1595 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1596 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1597 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1598 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1599 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1600 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1601 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1602 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1603 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1604 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1605 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1606 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1607 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1608 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1609 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1610 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1611 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1612 "psraw $0x8,%%xmm0 \n"
michael@0 1613 "psraw $0x8,%%xmm1 \n"
michael@0 1614 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1615 "paddb %%xmm5,%%xmm0 \n"
michael@0 1616 "sub $0x10,%3 \n"
michael@0 1617 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1618 BUNDLEALIGN
michael@0 1619 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1620 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1621 "jg 1b \n"
michael@0 1622 : "+r"(src_bgra0), // %0
michael@0 1623 "+r"(dst_u), // %1
michael@0 1624 "+r"(dst_v), // %2
michael@0 1625 "+rm"(width) // %3
michael@0 1626 : "r"((intptr_t)(src_stride_bgra)) // %4
michael@0 1627 : "memory", "cc"
michael@0 1628 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1629 , "r14"
michael@0 1630 #endif
michael@0 1631 #if defined(__SSE2__)
michael@0 1632 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1633 #endif
michael@0 1634 );
michael@0 1635 }
michael@0 1636
michael@0 1637 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
michael@0 1638 asm volatile (
michael@0 1639 "movdqa %4,%%xmm5 \n"
michael@0 1640 "movdqa %3,%%xmm4 \n"
michael@0 1641 LABELALIGN
michael@0 1642 "1: \n"
michael@0 1643 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1644 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1645 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1646 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 1647 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1648 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1649 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1650 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 1651 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1652 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1653 "phaddw %%xmm3,%%xmm2 \n"
michael@0 1654 "psrlw $0x7,%%xmm0 \n"
michael@0 1655 "psrlw $0x7,%%xmm2 \n"
michael@0 1656 "packuswb %%xmm2,%%xmm0 \n"
michael@0 1657 "paddb %%xmm5,%%xmm0 \n"
michael@0 1658 "sub $0x10,%2 \n"
michael@0 1659 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 1660 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1661 "jg 1b \n"
michael@0 1662 : "+r"(src_abgr), // %0
michael@0 1663 "+r"(dst_y), // %1
michael@0 1664 "+r"(pix) // %2
michael@0 1665 : "m"(kABGRToY), // %3
michael@0 1666 "m"(kAddY16) // %4
michael@0 1667 : "memory", "cc"
michael@0 1668 #if defined(__SSE2__)
michael@0 1669 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 1670 #endif
michael@0 1671 );
michael@0 1672 }
michael@0 1673
michael@0 1674 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
michael@0 1675 asm volatile (
michael@0 1676 "movdqa %4,%%xmm5 \n"
michael@0 1677 "movdqa %3,%%xmm4 \n"
michael@0 1678 LABELALIGN
michael@0 1679 "1: \n"
michael@0 1680 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1681 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1682 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1683 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 1684 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1685 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1686 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1687 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 1688 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1689 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1690 "phaddw %%xmm3,%%xmm2 \n"
michael@0 1691 "psrlw $0x7,%%xmm0 \n"
michael@0 1692 "psrlw $0x7,%%xmm2 \n"
michael@0 1693 "packuswb %%xmm2,%%xmm0 \n"
michael@0 1694 "paddb %%xmm5,%%xmm0 \n"
michael@0 1695 "sub $0x10,%2 \n"
michael@0 1696 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 1697 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1698 "jg 1b \n"
michael@0 1699 : "+r"(src_abgr), // %0
michael@0 1700 "+r"(dst_y), // %1
michael@0 1701 "+r"(pix) // %2
michael@0 1702 : "m"(kABGRToY), // %3
michael@0 1703 "m"(kAddY16) // %4
michael@0 1704 : "memory", "cc"
michael@0 1705 #if defined(__SSE2__)
michael@0 1706 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 1707 #endif
michael@0 1708 );
michael@0 1709 }
michael@0 1710
michael@0 1711 void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
michael@0 1712 asm volatile (
michael@0 1713 "movdqa %4,%%xmm5 \n"
michael@0 1714 "movdqa %3,%%xmm4 \n"
michael@0 1715 LABELALIGN
michael@0 1716 "1: \n"
michael@0 1717 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1718 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1719 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1720 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 1721 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1722 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1723 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1724 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 1725 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1726 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1727 "phaddw %%xmm3,%%xmm2 \n"
michael@0 1728 "psrlw $0x7,%%xmm0 \n"
michael@0 1729 "psrlw $0x7,%%xmm2 \n"
michael@0 1730 "packuswb %%xmm2,%%xmm0 \n"
michael@0 1731 "paddb %%xmm5,%%xmm0 \n"
michael@0 1732 "sub $0x10,%2 \n"
michael@0 1733 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 1734 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1735 "jg 1b \n"
michael@0 1736 : "+r"(src_rgba), // %0
michael@0 1737 "+r"(dst_y), // %1
michael@0 1738 "+r"(pix) // %2
michael@0 1739 : "m"(kRGBAToY), // %3
michael@0 1740 "m"(kAddY16) // %4
michael@0 1741 : "memory", "cc"
michael@0 1742 #if defined(__SSE2__)
michael@0 1743 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 1744 #endif
michael@0 1745 );
michael@0 1746 }
michael@0 1747
michael@0 1748 void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
michael@0 1749 asm volatile (
michael@0 1750 "movdqa %4,%%xmm5 \n"
michael@0 1751 "movdqa %3,%%xmm4 \n"
michael@0 1752 LABELALIGN
michael@0 1753 "1: \n"
michael@0 1754 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1755 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1756 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1757 "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 1758 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1759 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 1760 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1761 "pmaddubsw %%xmm4,%%xmm3 \n"
michael@0 1762 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1763 "phaddw %%xmm1,%%xmm0 \n"
michael@0 1764 "phaddw %%xmm3,%%xmm2 \n"
michael@0 1765 "psrlw $0x7,%%xmm0 \n"
michael@0 1766 "psrlw $0x7,%%xmm2 \n"
michael@0 1767 "packuswb %%xmm2,%%xmm0 \n"
michael@0 1768 "paddb %%xmm5,%%xmm0 \n"
michael@0 1769 "sub $0x10,%2 \n"
michael@0 1770 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 1771 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1772 "jg 1b \n"
michael@0 1773 : "+r"(src_rgba), // %0
michael@0 1774 "+r"(dst_y), // %1
michael@0 1775 "+r"(pix) // %2
michael@0 1776 : "m"(kRGBAToY), // %3
michael@0 1777 "m"(kAddY16) // %4
michael@0 1778 : "memory", "cc"
michael@0 1779 #if defined(__SSE2__)
michael@0 1780 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 1781 #endif
michael@0 1782 );
michael@0 1783 }
michael@0 1784
michael@0 1785 void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
michael@0 1786 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1787 asm volatile (
michael@0 1788 "movdqa %0,%%xmm4 \n"
michael@0 1789 "movdqa %1,%%xmm3 \n"
michael@0 1790 "movdqa %2,%%xmm5 \n"
michael@0 1791 :
michael@0 1792 : "m"(kABGRToU), // %0
michael@0 1793 "m"(kABGRToV), // %1
michael@0 1794 "m"(kAddUV128) // %2
michael@0 1795 );
michael@0 1796 asm volatile (
michael@0 1797 "sub %1,%2 \n"
michael@0 1798 LABELALIGN
michael@0 1799 "1: \n"
michael@0 1800 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1801 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1802 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1803 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1804 BUNDLEALIGN
michael@0 1805 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
michael@0 1806 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
michael@0 1807 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
michael@0 1808 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
michael@0 1809 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1810 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1811 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1812 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1813 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1814 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1815 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1816 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1817 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1818 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1819 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1820 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1821 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1822 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1823 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1824 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1825 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1826 "psraw $0x8,%%xmm0 \n"
michael@0 1827 "psraw $0x8,%%xmm1 \n"
michael@0 1828 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1829 "paddb %%xmm5,%%xmm0 \n"
michael@0 1830 "sub $0x10,%3 \n"
michael@0 1831 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1832 BUNDLEALIGN
michael@0 1833 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1834 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1835 "jg 1b \n"
michael@0 1836 : "+r"(src_abgr0), // %0
michael@0 1837 "+r"(dst_u), // %1
michael@0 1838 "+r"(dst_v), // %2
michael@0 1839 "+rm"(width) // %3
michael@0 1840 : "r"((intptr_t)(src_stride_abgr)) // %4
michael@0 1841 : "memory", "cc"
michael@0 1842 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1843 , "r14"
michael@0 1844 #endif
michael@0 1845 #if defined(__SSE2__)
michael@0 1846 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1847 #endif
michael@0 1848 );
michael@0 1849 }
michael@0 1850
michael@0 1851 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
michael@0 1852 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1853 asm volatile (
michael@0 1854 "movdqa %0,%%xmm4 \n"
michael@0 1855 "movdqa %1,%%xmm3 \n"
michael@0 1856 "movdqa %2,%%xmm5 \n"
michael@0 1857 :
michael@0 1858 : "m"(kABGRToU), // %0
michael@0 1859 "m"(kABGRToV), // %1
michael@0 1860 "m"(kAddUV128) // %2
michael@0 1861 );
michael@0 1862 asm volatile (
michael@0 1863 "sub %1,%2 \n"
michael@0 1864 LABELALIGN
michael@0 1865 "1: \n"
michael@0 1866 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1867 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1868 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1869 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1870 BUNDLEALIGN
michael@0 1871 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
michael@0 1872 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1873 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
michael@0 1874 "pavgb %%xmm7,%%xmm1 \n"
michael@0 1875 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
michael@0 1876 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1877 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
michael@0 1878 "pavgb %%xmm7,%%xmm6 \n"
michael@0 1879 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1880 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1881 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1882 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1883 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1884 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1885 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1886 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1887 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1888 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1889 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1890 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1891 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1892 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1893 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1894 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1895 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1896 "psraw $0x8,%%xmm0 \n"
michael@0 1897 "psraw $0x8,%%xmm1 \n"
michael@0 1898 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1899 "paddb %%xmm5,%%xmm0 \n"
michael@0 1900 "sub $0x10,%3 \n"
michael@0 1901 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1902 BUNDLEALIGN
michael@0 1903 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1904 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1905 "jg 1b \n"
michael@0 1906 : "+r"(src_abgr0), // %0
michael@0 1907 "+r"(dst_u), // %1
michael@0 1908 "+r"(dst_v), // %2
michael@0 1909 "+rm"(width) // %3
michael@0 1910 : "r"((intptr_t)(src_stride_abgr)) // %4
michael@0 1911 : "memory", "cc"
michael@0 1912 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1913 , "r14"
michael@0 1914 #endif
michael@0 1915 #if defined(__SSE2__)
michael@0 1916 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1917 #endif
michael@0 1918 );
michael@0 1919 }
michael@0 1920
michael@0 1921 void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
michael@0 1922 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1923 asm volatile (
michael@0 1924 "movdqa %0,%%xmm4 \n"
michael@0 1925 "movdqa %1,%%xmm3 \n"
michael@0 1926 "movdqa %2,%%xmm5 \n"
michael@0 1927 :
michael@0 1928 : "m"(kRGBAToU), // %0
michael@0 1929 "m"(kRGBAToV), // %1
michael@0 1930 "m"(kAddUV128) // %2
michael@0 1931 );
michael@0 1932 asm volatile (
michael@0 1933 "sub %1,%2 \n"
michael@0 1934 LABELALIGN
michael@0 1935 "1: \n"
michael@0 1936 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1937 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 1938 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 1939 "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 1940 BUNDLEALIGN
michael@0 1941 MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0
michael@0 1942 MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1
michael@0 1943 MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2
michael@0 1944 MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6
michael@0 1945 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 1946 "movdqa %%xmm0,%%xmm7 \n"
michael@0 1947 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1948 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 1949 "pavgb %%xmm7,%%xmm0 \n"
michael@0 1950 "movdqa %%xmm2,%%xmm7 \n"
michael@0 1951 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 1952 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 1953 "pavgb %%xmm7,%%xmm2 \n"
michael@0 1954 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1955 "movdqa %%xmm2,%%xmm6 \n"
michael@0 1956 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 1957 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 1958 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 1959 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 1960 "phaddw %%xmm2,%%xmm0 \n"
michael@0 1961 "phaddw %%xmm6,%%xmm1 \n"
michael@0 1962 "psraw $0x8,%%xmm0 \n"
michael@0 1963 "psraw $0x8,%%xmm1 \n"
michael@0 1964 "packsswb %%xmm1,%%xmm0 \n"
michael@0 1965 "paddb %%xmm5,%%xmm0 \n"
michael@0 1966 "sub $0x10,%3 \n"
michael@0 1967 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 1968 BUNDLEALIGN
michael@0 1969 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 1970 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 1971 "jg 1b \n"
michael@0 1972 : "+r"(src_rgba0), // %0
michael@0 1973 "+r"(dst_u), // %1
michael@0 1974 "+r"(dst_v), // %2
michael@0 1975 "+rm"(width) // %3
michael@0 1976   : "r"((intptr_t)(src_stride_rgba)) // %4
michael@0 1977 : "memory", "cc"
michael@0 1978 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1979 , "r14"
michael@0 1980 #endif
michael@0 1981 #if defined(__SSE2__)
michael@0 1982 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 1983 #endif
michael@0 1984 );
michael@0 1985 }
michael@0 1986
michael@0 1987 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
michael@0 1988 uint8* dst_u, uint8* dst_v, int width) {
michael@0 1989 asm volatile (
michael@0 1990 "movdqa %0,%%xmm4 \n"
michael@0 1991 "movdqa %1,%%xmm3 \n"
michael@0 1992 "movdqa %2,%%xmm5 \n"
michael@0 1993 :
michael@0 1994 : "m"(kRGBAToU), // %0
michael@0 1995 "m"(kRGBAToV), // %1
michael@0 1996 "m"(kAddUV128) // %2
michael@0 1997 );
michael@0 1998 asm volatile (
michael@0 1999 "sub %1,%2 \n"
michael@0 2000 LABELALIGN
michael@0 2001 "1: \n"
michael@0 2002 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 2003 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 2004 "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 2005 "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n"
michael@0 2006 BUNDLEALIGN
michael@0 2007 MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7
michael@0 2008 "pavgb %%xmm7,%%xmm0 \n"
michael@0 2009 MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7
michael@0 2010 "pavgb %%xmm7,%%xmm1 \n"
michael@0 2011 MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7
michael@0 2012 "pavgb %%xmm7,%%xmm2 \n"
michael@0 2013 MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7
michael@0 2014 "pavgb %%xmm7,%%xmm6 \n"
michael@0 2015 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 2016 "movdqa %%xmm0,%%xmm7 \n"
michael@0 2017 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 2018 "shufps $0xdd,%%xmm1,%%xmm7 \n"
michael@0 2019 "pavgb %%xmm7,%%xmm0 \n"
michael@0 2020 "movdqa %%xmm2,%%xmm7 \n"
michael@0 2021 "shufps $0x88,%%xmm6,%%xmm2 \n"
michael@0 2022 "shufps $0xdd,%%xmm6,%%xmm7 \n"
michael@0 2023 "pavgb %%xmm7,%%xmm2 \n"
michael@0 2024 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2025 "movdqa %%xmm2,%%xmm6 \n"
michael@0 2026 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 2027 "pmaddubsw %%xmm4,%%xmm2 \n"
michael@0 2028 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 2029 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 2030 "phaddw %%xmm2,%%xmm0 \n"
michael@0 2031 "phaddw %%xmm6,%%xmm1 \n"
michael@0 2032 "psraw $0x8,%%xmm0 \n"
michael@0 2033 "psraw $0x8,%%xmm1 \n"
michael@0 2034 "packsswb %%xmm1,%%xmm0 \n"
michael@0 2035 "paddb %%xmm5,%%xmm0 \n"
michael@0 2036 "sub $0x10,%3 \n"
michael@0 2037 "movlps %%xmm0," MEMACCESS(1) " \n"
michael@0 2038 BUNDLEALIGN
michael@0 2039 MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1)
michael@0 2040 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 2041 "jg 1b \n"
michael@0 2042 : "+r"(src_rgba0), // %0
michael@0 2043 "+r"(dst_u), // %1
michael@0 2044 "+r"(dst_v), // %2
michael@0 2045 "+rm"(width) // %3
michael@0 2046 : "r"((intptr_t)(src_stride_rgba)) // %4
michael@0 2047 : "memory", "cc"
michael@0 2048 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2049 , "r14"
michael@0 2050 #endif
michael@0 2051 #if defined(__SSE2__)
michael@0 2052 , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
michael@0 2053 #endif
michael@0 2054 );
michael@0 2055 }
michael@0 2056 #endif // HAS_ARGBTOUVROW_SSSE3
michael@0 2057
michael@0 2058 #ifdef HAS_I422TOARGBROW_SSSE3
michael@0 2059 #define UB 127 /* saturated: 2.018 * 64 = 129, clamped to int8 max */
michael@0 2060 #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
michael@0 2061 #define UR 0
michael@0 2062
michael@0 2063 #define VB 0
michael@0 2064 #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
michael@0 2065 #define VR 102 /* (int8)(1.596 * 64 + 0.5) */
michael@0 2066
michael@0 2067 // Bias
michael@0 2068 #define BB (UB * 128 + VB * 128)
michael@0 2069 #define BG (UG * 128 + VG * 128)
michael@0 2070 #define BR (UR * 128 + VR * 128)
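
// Worked through: U and V arrive biased by +128, so pmaddubsw of the raw
// bytes carries a constant 128 * (coefficient sum) per channel, which the
// kUVBias* rows below subtract again:
//   BB = 127 * 128 +    0 * 128 =  16256
//   BG = -25 * 128 + (-52) * 128 = -9856
//   BR =   0 * 128 +  102 * 128 =  13056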
michael@0 2071
michael@0 2072 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
michael@0 2073
michael@0 2074 struct {
michael@0 2075 vec8 kUVToB; // 0
michael@0 2076 vec8 kUVToG; // 16
michael@0 2077 vec8 kUVToR; // 32
michael@0 2078 vec16 kUVBiasB; // 48
michael@0 2079 vec16 kUVBiasG; // 64
michael@0 2080 vec16 kUVBiasR; // 80
michael@0 2081 vec16 kYSub16; // 96
michael@0 2082 vec16 kYToRgb; // 112
michael@0 2083 vec8 kVUToB; // 128
michael@0 2084 vec8 kVUToG; // 144
michael@0 2085 vec8 kVUToR; // 160
michael@0 2086 } static SIMD_ALIGNED(kYuvConstants) = {
michael@0 2087 { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
michael@0 2088 { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
michael@0 2089 { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
michael@0 2090 { BB, BB, BB, BB, BB, BB, BB, BB },
michael@0 2091 { BG, BG, BG, BG, BG, BG, BG, BG },
michael@0 2092 { BR, BR, BR, BR, BR, BR, BR, BR },
michael@0 2093 { 16, 16, 16, 16, 16, 16, 16, 16 },
michael@0 2094 { YG, YG, YG, YG, YG, YG, YG, YG },
michael@0 2095 { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
michael@0 2096 { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
michael@0 2097 { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
michael@0 2098 };
michael@0 2099
michael@0 2100
michael@0 2101 // Read 8 UV from 444
michael@0 2102 #define READYUV444 \
michael@0 2103 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
michael@0 2104 BUNDLEALIGN \
michael@0 2105 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
michael@0 2106 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
michael@0 2107 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2108
michael@0 2109 // Read 4 UV from 422, upsample to 8 UV
michael@0 2110 #define READYUV422 \
michael@0 2111 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
michael@0 2112 BUNDLEALIGN \
michael@0 2113 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
michael@0 2114 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \
michael@0 2115 "punpcklbw %%xmm1,%%xmm0 \n" \
michael@0 2116 "punpcklwd %%xmm0,%%xmm0 \n"
michael@0 2117
michael@0 2118 // Read 2 UV from 411, upsample to 8 UV
michael@0 2119 #define READYUV411 \
michael@0 2120 "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
michael@0 2121 BUNDLEALIGN \
michael@0 2122 MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
michael@0 2123 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
michael@0 2124 "punpcklbw %%xmm1,%%xmm0 \n" \
michael@0 2125 "punpcklwd %%xmm0,%%xmm0 \n" \
michael@0 2126 "punpckldq %%xmm0,%%xmm0 \n"
michael@0 2127
michael@0 2128 // Read 4 UV from NV12, upsample to 8 UV
michael@0 2129 #define READNV12 \
michael@0 2130 "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \
michael@0 2131 "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \
michael@0 2132 "punpcklwd %%xmm0,%%xmm0 \n"
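
// Taken together: READYUV444 fetches one U and one V byte per pixel;
// READYUV422 fetches one per two pixels and doubles each UV pair with
// punpcklwd; READYUV411 fetches one per four pixels and doubles twice
// (punpcklwd, then punpckldq); READNV12 fetches already-interleaved UV
// pairs and doubles them with punpcklwd.  All four leave eight pixels'
// worth of interleaved U,V bytes in xmm0 for YUVTORGB/YVUTORGB below.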
michael@0 2133
michael@0 2134 // Convert 8 pixels: 8 UV and 8 Y
michael@0 2135 #define YUVTORGB \
michael@0 2136 "movdqa %%xmm0,%%xmm1 \n" \
michael@0 2137 "movdqa %%xmm0,%%xmm2 \n" \
michael@0 2138 "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \
michael@0 2139 "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \
michael@0 2140 "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \
michael@0 2141 "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
michael@0 2142 "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
michael@0 2143 "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
michael@0 2144 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
michael@0 2145 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
michael@0 2146 "punpcklbw %%xmm4,%%xmm3 \n" \
michael@0 2147 "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
michael@0 2148 "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
michael@0 2149 "paddsw %%xmm3,%%xmm0 \n" \
michael@0 2150 "paddsw %%xmm3,%%xmm1 \n" \
michael@0 2151 "paddsw %%xmm3,%%xmm2 \n" \
michael@0 2152 "psraw $0x6,%%xmm0 \n" \
michael@0 2153 "psraw $0x6,%%xmm1 \n" \
michael@0 2154 "psraw $0x6,%%xmm2 \n" \
michael@0 2155 "packuswb %%xmm0,%%xmm0 \n" \
michael@0 2156 "packuswb %%xmm1,%%xmm1 \n" \
michael@0 2157 "packuswb %%xmm2,%%xmm2 \n"
michael@0 2158
michael@0 2159 // Convert 8 pixels: 8 VU and 8 Y
michael@0 2160 #define YVUTORGB \
michael@0 2161 "movdqa %%xmm0,%%xmm1 \n" \
michael@0 2162 "movdqa %%xmm0,%%xmm2 \n" \
michael@0 2163 "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \
michael@0 2164 "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \
michael@0 2165 "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \
michael@0 2166 "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \
michael@0 2167 "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \
michael@0 2168 "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \
michael@0 2169 "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \
michael@0 2170 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \
michael@0 2171 "punpcklbw %%xmm4,%%xmm3 \n" \
michael@0 2172 "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \
michael@0 2173 "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \
michael@0 2174 "paddsw %%xmm3,%%xmm0 \n" \
michael@0 2175 "paddsw %%xmm3,%%xmm1 \n" \
michael@0 2176 "paddsw %%xmm3,%%xmm2 \n" \
michael@0 2177 "psraw $0x6,%%xmm0 \n" \
michael@0 2178 "psraw $0x6,%%xmm1 \n" \
michael@0 2179 "psraw $0x6,%%xmm2 \n" \
michael@0 2180 "packuswb %%xmm0,%%xmm0 \n" \
michael@0 2181 "packuswb %%xmm1,%%xmm1 \n" \
michael@0 2182 "packuswb %%xmm2,%%xmm2 \n"
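
// Scalar sketch of YUVTORGB, written with the UB..YG and BB..BR constants
// above.  Hypothetical helpers for illustration only: paddsw/psubsw
// saturation is elided and >> is assumed arithmetic, matching psraw.
static int ScalarClampToByte(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);  // what packuswb does per lane
}
static void ScalarYuvToRgbReference(int y, int u, int v,
                                    uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * YG;  // kYSub16, then kYToRgb
  *b = (uint8)ScalarClampToByte((UB * u + VB * v - BB + y1) >> 6);
  *g = (uint8)ScalarClampToByte((UG * u + VG * v - BG + y1) >> 6);
  *r = (uint8)ScalarClampToByte((UR * u + VR * v - BR + y1) >> 6);
}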
michael@0 2183
michael@0 2184 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2185 const uint8* u_buf,
michael@0 2186 const uint8* v_buf,
michael@0 2187 uint8* dst_argb,
michael@0 2188 int width) {
michael@0 2189 asm volatile (
michael@0 2190 "sub %[u_buf],%[v_buf] \n"
michael@0 2191 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2192 "pxor %%xmm4,%%xmm4 \n"
michael@0 2193 LABELALIGN
michael@0 2194 "1: \n"
michael@0 2195 READYUV444
michael@0 2196 YUVTORGB
michael@0 2197 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2198 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2199 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2200 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2201 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2202 "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n"
michael@0 2203 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n"
michael@0 2204 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2205 "sub $0x8,%[width] \n"
michael@0 2206 "jg 1b \n"
michael@0 2207 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2208 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2209 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2210 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2211 [width]"+rm"(width) // %[width]
michael@0 2212 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2213 : "memory", "cc"
michael@0 2214 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2215 , "r14"
michael@0 2216 #endif
michael@0 2217 #if defined(__SSE2__)
michael@0 2218 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2219 #endif
michael@0 2220 );
michael@0 2221 }
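
// The four unpacks after YUVTORGB transpose planar B (xmm0), G (xmm1),
// R (xmm2) and the 0xff alpha in xmm5 into interleaved pixels; per pixel
// the stored bytes are:
//   dst_argb[0] = b; dst_argb[1] = g; dst_argb[2] = r; dst_argb[3] = 255;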
michael@0 2222
michael@0 2223 void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
michael@0 2224 const uint8* u_buf,
michael@0 2225 const uint8* v_buf,
michael@0 2226 uint8* dst_rgb24,
michael@0 2227 int width) {
michael@0 2228   // With -fPIC, 32 bit gcc 4.2 on OSX runs out of GPR regs; load masks first.
michael@0 2229 #if defined(__i386__)
michael@0 2230 asm volatile (
michael@0 2231 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
michael@0 2232 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
michael@0 2233 :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
michael@0 2234 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
michael@0 2235 #endif
michael@0 2236
michael@0 2237 asm volatile (
michael@0 2238 #if !defined(__i386__)
michael@0 2239 "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
michael@0 2240 "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n"
michael@0 2241 #endif
michael@0 2242 "sub %[u_buf],%[v_buf] \n"
michael@0 2243 "pxor %%xmm4,%%xmm4 \n"
michael@0 2244 LABELALIGN
michael@0 2245 "1: \n"
michael@0 2246 READYUV422
michael@0 2247 YUVTORGB
michael@0 2248 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2249 "punpcklbw %%xmm2,%%xmm2 \n"
michael@0 2250 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2251 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2252 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2253 "pshufb %%xmm5,%%xmm0 \n"
michael@0 2254 "pshufb %%xmm6,%%xmm1 \n"
michael@0 2255 "palignr $0xc,%%xmm0,%%xmm1 \n"
michael@0 2256 "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n"
michael@0 2257 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
michael@0 2258 "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
michael@0 2259 "sub $0x8,%[width] \n"
michael@0 2260 "jg 1b \n"
michael@0 2261 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2262 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2263 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2264 [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24]
michael@0 2265 [width]"+rm"(width) // %[width]
michael@0 2266 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
michael@0 2267 #if !defined(__i386__)
michael@0 2268 , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
michael@0 2269 [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
michael@0 2270 #endif
michael@0 2271 : "memory", "cc"
michael@0 2272 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2273 , "r14"
michael@0 2274 #endif
michael@0 2275 #if defined(__SSE2__)
michael@0 2276 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 2277 #endif
michael@0 2278 );
michael@0 2279 }
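
// I422ToRGB24Row produces the same B,G,R values but stores no alpha byte:
// the two pshufb masks (kShuffleMaskARGBToRGB24_0 and kShuffleMaskARGBToRGB24,
// defined elsewhere in this file) plus the palignr squeeze eight 4-byte
// ARGB pixels down to 24 bytes of packed B,G,R per loop iteration.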
michael@0 2280
michael@0 2281 void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
michael@0 2282 const uint8* u_buf,
michael@0 2283 const uint8* v_buf,
michael@0 2284 uint8* dst_raw,
michael@0 2285 int width) {
michael@0 2286   // With -fPIC, 32 bit gcc 4.2 on OSX runs out of GPR regs; load masks first.
michael@0 2287 #if defined(__i386__)
michael@0 2288 asm volatile (
michael@0 2289 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
michael@0 2290 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
michael@0 2291 :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
michael@0 2292 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
michael@0 2293 #endif
michael@0 2294
michael@0 2295 asm volatile (
michael@0 2296 #if !defined(__i386__)
michael@0 2297 "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
michael@0 2298 "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n"
michael@0 2299 #endif
michael@0 2300 "sub %[u_buf],%[v_buf] \n"
michael@0 2301 "pxor %%xmm4,%%xmm4 \n"
michael@0 2302 LABELALIGN
michael@0 2303 "1: \n"
michael@0 2304 READYUV422
michael@0 2305 YUVTORGB
michael@0 2306 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2307 "punpcklbw %%xmm2,%%xmm2 \n"
michael@0 2308 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2309 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2310 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2311 "pshufb %%xmm5,%%xmm0 \n"
michael@0 2312 "pshufb %%xmm6,%%xmm1 \n"
michael@0 2313 "palignr $0xc,%%xmm0,%%xmm1 \n"
michael@0 2314 "movq %%xmm0," MEMACCESS([dst_raw]) " \n"
michael@0 2315 "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
michael@0 2316 "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
michael@0 2317 "sub $0x8,%[width] \n"
michael@0 2318 "jg 1b \n"
michael@0 2319 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2320 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2321 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2322 [dst_raw]"+r"(dst_raw), // %[dst_raw]
michael@0 2323 [width]"+rm"(width) // %[width]
michael@0 2324 : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
michael@0 2325 #if !defined(__i386__)
michael@0 2326 , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
michael@0 2327 [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
michael@0 2328 #endif
michael@0 2329 : "memory", "cc"
michael@0 2330 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2331 , "r14"
michael@0 2332 #endif
michael@0 2333 #if defined(__SSE2__)
michael@0 2334 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 2335 #endif
michael@0 2336 );
michael@0 2337 }
michael@0 2338
michael@0 2339 void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2340 const uint8* u_buf,
michael@0 2341 const uint8* v_buf,
michael@0 2342 uint8* dst_argb,
michael@0 2343 int width) {
michael@0 2344 asm volatile (
michael@0 2345 "sub %[u_buf],%[v_buf] \n"
michael@0 2346 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2347 "pxor %%xmm4,%%xmm4 \n"
michael@0 2348 LABELALIGN
michael@0 2349 "1: \n"
michael@0 2350 READYUV422
michael@0 2351 YUVTORGB
michael@0 2352 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2353 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2354 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2355 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2356 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2357 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2358 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2359 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2360 "sub $0x8,%[width] \n"
michael@0 2361 "jg 1b \n"
michael@0 2362 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2363 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2364 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2365 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2366 [width]"+rm"(width) // %[width]
michael@0 2367 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2368 : "memory", "cc"
michael@0 2369 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2370 , "r14"
michael@0 2371 #endif
michael@0 2372 #if defined(__SSE2__)
michael@0 2373 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2374 #endif
michael@0 2375 );
michael@0 2376 }
michael@0 2377
michael@0 2378 void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2379 const uint8* u_buf,
michael@0 2380 const uint8* v_buf,
michael@0 2381 uint8* dst_argb,
michael@0 2382 int width) {
michael@0 2383 asm volatile (
michael@0 2384 "sub %[u_buf],%[v_buf] \n"
michael@0 2385 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2386 "pxor %%xmm4,%%xmm4 \n"
michael@0 2387 LABELALIGN
michael@0 2388 "1: \n"
michael@0 2389 READYUV411
michael@0 2390 YUVTORGB
michael@0 2391 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2392 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2393 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2394 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2395 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2396 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2397 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2398 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2399 "sub $0x8,%[width] \n"
michael@0 2400 "jg 1b \n"
michael@0 2401 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2402 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2403 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2404 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2405 [width]"+rm"(width) // %[width]
michael@0 2406 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2407 : "memory", "cc"
michael@0 2408 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2409 , "r14"
michael@0 2410 #endif
michael@0 2411 #if defined(__SSE2__)
michael@0 2412 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2413 #endif
michael@0 2414 );
michael@0 2415 }
michael@0 2416
michael@0 2417 void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2418 const uint8* uv_buf,
michael@0 2419 uint8* dst_argb,
michael@0 2420 int width) {
michael@0 2421 asm volatile (
michael@0 2422 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2423 "pxor %%xmm4,%%xmm4 \n"
michael@0 2424 LABELALIGN
michael@0 2425 "1: \n"
michael@0 2426 READNV12
michael@0 2427 YUVTORGB
michael@0 2428 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2429 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2430 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2431 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2432 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2433 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2434 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2435 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2436 "sub $0x8,%[width] \n"
michael@0 2437 "jg 1b \n"
michael@0 2438 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2439 [uv_buf]"+r"(uv_buf), // %[uv_buf]
michael@0 2440 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2441 [width]"+rm"(width) // %[width]
michael@0 2442 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2443 : "memory", "cc"
michael@0 2444 // Does not use r14.
michael@0 2445 #if defined(__SSE2__)
michael@0 2446 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2447 #endif
michael@0 2448 );
michael@0 2449 }
michael@0 2450
michael@0 2451 void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
michael@0 2452 const uint8* uv_buf,
michael@0 2453 uint8* dst_argb,
michael@0 2454 int width) {
michael@0 2455 asm volatile (
michael@0 2456 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2457 "pxor %%xmm4,%%xmm4 \n"
michael@0 2458 LABELALIGN
michael@0 2459 "1: \n"
michael@0 2460 READNV12
michael@0 2461 YVUTORGB
michael@0 2462 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2463 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2464 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2465 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2466 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2467 "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2468 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2469 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2470 "sub $0x8,%[width] \n"
michael@0 2471 "jg 1b \n"
michael@0 2472 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2473 [uv_buf]"+r"(uv_buf), // %[uv_buf]
michael@0 2474 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2475 [width]"+rm"(width) // %[width]
michael@0 2476 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2477 : "memory", "cc"
michael@0 2478 // Does not use r14.
michael@0 2479 #if defined(__SSE2__)
michael@0 2480 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2481 #endif
michael@0 2482 );
michael@0 2483 }
michael@0 2484
michael@0 2485 void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2486 const uint8* u_buf,
michael@0 2487 const uint8* v_buf,
michael@0 2488 uint8* dst_argb,
michael@0 2489 int width) {
michael@0 2490 asm volatile (
michael@0 2491 "sub %[u_buf],%[v_buf] \n"
michael@0 2492 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2493 "pxor %%xmm4,%%xmm4 \n"
michael@0 2494 LABELALIGN
michael@0 2495 "1: \n"
michael@0 2496 READYUV444
michael@0 2497 YUVTORGB
michael@0 2498 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2499 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2500 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2501 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2502 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2503 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2504 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2505 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2506 "sub $0x8,%[width] \n"
michael@0 2507 "jg 1b \n"
michael@0 2508 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2509 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2510 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2511 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2512 [width]"+rm"(width) // %[width]
michael@0 2513 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2514 : "memory", "cc"
michael@0 2515 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2516 , "r14"
michael@0 2517 #endif
michael@0 2518 #if defined(__SSE2__)
michael@0 2519 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2520 #endif
michael@0 2521 );
michael@0 2522 }
michael@0 2523
michael@0 2524 void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2525 const uint8* u_buf,
michael@0 2526 const uint8* v_buf,
michael@0 2527 uint8* dst_argb,
michael@0 2528 int width) {
michael@0 2529 asm volatile (
michael@0 2530 "sub %[u_buf],%[v_buf] \n"
michael@0 2531 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2532 "pxor %%xmm4,%%xmm4 \n"
michael@0 2533 LABELALIGN
michael@0 2534 "1: \n"
michael@0 2535 READYUV422
michael@0 2536 YUVTORGB
michael@0 2537 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2538 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2539 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2540 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2541 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2542 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2543 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2544 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2545 "sub $0x8,%[width] \n"
michael@0 2546 "jg 1b \n"
michael@0 2547 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2548 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2549 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2550 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2551 [width]"+rm"(width) // %[width]
michael@0 2552 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2553 : "memory", "cc"
michael@0 2554 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2555 , "r14"
michael@0 2556 #endif
michael@0 2557 #if defined(__SSE2__)
michael@0 2558 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2559 #endif
michael@0 2560 );
michael@0 2561 }
michael@0 2562
michael@0 2563 void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2564 const uint8* u_buf,
michael@0 2565 const uint8* v_buf,
michael@0 2566 uint8* dst_argb,
michael@0 2567 int width) {
michael@0 2568 asm volatile (
michael@0 2569 "sub %[u_buf],%[v_buf] \n"
michael@0 2570 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2571 "pxor %%xmm4,%%xmm4 \n"
michael@0 2572 LABELALIGN
michael@0 2573 "1: \n"
michael@0 2574 READYUV411
michael@0 2575 YUVTORGB
michael@0 2576 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2577 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2578 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2579 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2580 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2581 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2582 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2583 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2584 "sub $0x8,%[width] \n"
michael@0 2585 "jg 1b \n"
michael@0 2586 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2587 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2588 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2589 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2590 [width]"+rm"(width) // %[width]
michael@0 2591 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2592 : "memory", "cc"
michael@0 2593 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2594 , "r14"
michael@0 2595 #endif
michael@0 2596 #if defined(__SSE2__)
michael@0 2597 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2598 #endif
michael@0 2599 );
michael@0 2600 }
michael@0 2601
michael@0 2602 void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2603 const uint8* uv_buf,
michael@0 2604 uint8* dst_argb,
michael@0 2605 int width) {
michael@0 2606 asm volatile (
michael@0 2607 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2608 "pxor %%xmm4,%%xmm4 \n"
michael@0 2609 LABELALIGN
michael@0 2610 "1: \n"
michael@0 2611 READNV12
michael@0 2612 YUVTORGB
michael@0 2613 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2614 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2615 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2616 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2617 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2618 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2619 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2620 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2621 "sub $0x8,%[width] \n"
michael@0 2622 "jg 1b \n"
michael@0 2623 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2624 [uv_buf]"+r"(uv_buf), // %[uv_buf]
michael@0 2625 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2626 [width]"+rm"(width) // %[width]
michael@0 2627 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2628 : "memory", "cc"
michael@0 2629 // Does not use r14.
michael@0 2630 #if defined(__SSE2__)
michael@0 2631 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2632 #endif
michael@0 2633 );
michael@0 2634 }
michael@0 2635
michael@0 2636 void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2637 const uint8* uv_buf,
michael@0 2638 uint8* dst_argb,
michael@0 2639 int width) {
michael@0 2640 asm volatile (
michael@0 2641 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2642 "pxor %%xmm4,%%xmm4 \n"
michael@0 2643 LABELALIGN
michael@0 2644 "1: \n"
michael@0 2645 READNV12
michael@0 2646 YVUTORGB
michael@0 2647 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 2648 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 2649 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2650 "punpcklwd %%xmm2,%%xmm0 \n"
michael@0 2651 "punpckhwd %%xmm2,%%xmm1 \n"
michael@0 2652 "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n"
michael@0 2653 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
michael@0 2654 "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
michael@0 2655 "sub $0x8,%[width] \n"
michael@0 2656 "jg 1b \n"
michael@0 2657 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2658 [uv_buf]"+r"(uv_buf), // %[uv_buf]
michael@0 2659 [dst_argb]"+r"(dst_argb), // %[dst_argb]
michael@0 2660 [width]"+rm"(width) // %[width]
michael@0 2661 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2662 : "memory", "cc"
michael@0 2663 // Does not use r14.
michael@0 2664 #if defined(__SSE2__)
michael@0 2665 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2666 #endif
michael@0 2667 );
michael@0 2668 }
michael@0 2669
michael@0 2670 void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
michael@0 2671 const uint8* u_buf,
michael@0 2672 const uint8* v_buf,
michael@0 2673 uint8* dst_bgra,
michael@0 2674 int width) {
michael@0 2675 asm volatile (
michael@0 2676 "sub %[u_buf],%[v_buf] \n"
michael@0 2677 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2678 "pxor %%xmm4,%%xmm4 \n"
michael@0 2679 LABELALIGN
michael@0 2680 "1: \n"
michael@0 2681 READYUV422
michael@0 2682 YUVTORGB
michael@0 2683 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2684 "punpcklbw %%xmm0,%%xmm1 \n"
michael@0 2685 "punpcklbw %%xmm2,%%xmm5 \n"
michael@0 2686 "movdqa %%xmm5,%%xmm0 \n"
michael@0 2687 "punpcklwd %%xmm1,%%xmm5 \n"
michael@0 2688 "punpckhwd %%xmm1,%%xmm0 \n"
michael@0 2689 "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n"
michael@0 2690 "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
michael@0 2691 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
michael@0 2692 "sub $0x8,%[width] \n"
michael@0 2693 "jg 1b \n"
michael@0 2694 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2695 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2696 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2697 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
michael@0 2698 [width]"+rm"(width) // %[width]
michael@0 2699 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2700 : "memory", "cc"
michael@0 2701 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2702 , "r14"
michael@0 2703 #endif
michael@0 2704 #if defined(__SSE2__)
michael@0 2705 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2706 #endif
michael@0 2707 );
michael@0 2708 }
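
// For BGRA output the unpack order is reversed: the alpha mask in xmm5 is
// regenerated each iteration (the first unpack consumes it), G,B and A,R
// are interleaved first, and the final punpcklwd/punpckhwd store each
// pixel as A,R,G,B bytes -- the order implied by the kBGRAToY coefficients
// near the top of this file.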
michael@0 2709
michael@0 2710 void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
michael@0 2711 const uint8* u_buf,
michael@0 2712 const uint8* v_buf,
michael@0 2713 uint8* dst_abgr,
michael@0 2714 int width) {
michael@0 2715 asm volatile (
michael@0 2716 "sub %[u_buf],%[v_buf] \n"
michael@0 2717 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2718 "pxor %%xmm4,%%xmm4 \n"
michael@0 2719 LABELALIGN
michael@0 2720 "1: \n"
michael@0 2721 READYUV422
michael@0 2722 YUVTORGB
michael@0 2723 "punpcklbw %%xmm1,%%xmm2 \n"
michael@0 2724 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 2725 "movdqa %%xmm2,%%xmm1 \n"
michael@0 2726 "punpcklwd %%xmm0,%%xmm2 \n"
michael@0 2727 "punpckhwd %%xmm0,%%xmm1 \n"
michael@0 2728 "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n"
michael@0 2729 "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
michael@0 2730 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
michael@0 2731 "sub $0x8,%[width] \n"
michael@0 2732 "jg 1b \n"
michael@0 2733 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2734 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2735 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2736 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
michael@0 2737 [width]"+rm"(width) // %[width]
michael@0 2738 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2739 : "memory", "cc"
michael@0 2740 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2741 , "r14"
michael@0 2742 #endif
michael@0 2743 #if defined(__SSE2__)
michael@0 2744 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2745 #endif
michael@0 2746 );
michael@0 2747 }
michael@0 2748
michael@0 2749 void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
michael@0 2750 const uint8* u_buf,
michael@0 2751 const uint8* v_buf,
michael@0 2752 uint8* dst_rgba,
michael@0 2753 int width) {
michael@0 2754 asm volatile (
michael@0 2755 "sub %[u_buf],%[v_buf] \n"
michael@0 2756 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2757 "pxor %%xmm4,%%xmm4 \n"
michael@0 2758 LABELALIGN
michael@0 2759 "1: \n"
michael@0 2760 READYUV422
michael@0 2761 YUVTORGB
michael@0 2762 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2763 "punpcklbw %%xmm2,%%xmm1 \n"
michael@0 2764 "punpcklbw %%xmm0,%%xmm5 \n"
michael@0 2765 "movdqa %%xmm5,%%xmm0 \n"
michael@0 2766 "punpcklwd %%xmm1,%%xmm5 \n"
michael@0 2767 "punpckhwd %%xmm1,%%xmm0 \n"
michael@0 2768 "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n"
michael@0 2769 "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
michael@0 2770 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
michael@0 2771 "sub $0x8,%[width] \n"
michael@0 2772 "jg 1b \n"
michael@0 2773 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2774 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2775 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2776 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
michael@0 2777 [width]"+rm"(width) // %[width]
michael@0 2778 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2779 : "memory", "cc"
michael@0 2780 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2781 , "r14"
michael@0 2782 #endif
michael@0 2783 #if defined(__SSE2__)
michael@0 2784 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2785 #endif
michael@0 2786 );
michael@0 2787 }
michael@0 2788
michael@0 2789 void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2790 const uint8* u_buf,
michael@0 2791 const uint8* v_buf,
michael@0 2792 uint8* dst_bgra,
michael@0 2793 int width) {
michael@0 2794 asm volatile (
michael@0 2795 "sub %[u_buf],%[v_buf] \n"
michael@0 2796 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2797 "pxor %%xmm4,%%xmm4 \n"
michael@0 2798 LABELALIGN
michael@0 2799 "1: \n"
michael@0 2800 READYUV422
michael@0 2801 YUVTORGB
michael@0 2802 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2803 "punpcklbw %%xmm0,%%xmm1 \n"
michael@0 2804 "punpcklbw %%xmm2,%%xmm5 \n"
michael@0 2805 "movdqa %%xmm5,%%xmm0 \n"
michael@0 2806 "punpcklwd %%xmm1,%%xmm5 \n"
michael@0 2807 "punpckhwd %%xmm1,%%xmm0 \n"
michael@0 2808 "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n"
michael@0 2809 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
michael@0 2810 "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
michael@0 2811 "sub $0x8,%[width] \n"
michael@0 2812 "jg 1b \n"
michael@0 2813 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2814 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2815 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2816 [dst_bgra]"+r"(dst_bgra), // %[dst_bgra]
michael@0 2817 [width]"+rm"(width) // %[width]
michael@0 2818 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2819 : "memory", "cc"
michael@0 2820 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2821 , "r14"
michael@0 2822 #endif
michael@0 2823 #if defined(__SSE2__)
michael@0 2824 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2825 #endif
michael@0 2826 );
michael@0 2827 }
michael@0 2828
michael@0 2829 void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2830 const uint8* u_buf,
michael@0 2831 const uint8* v_buf,
michael@0 2832 uint8* dst_abgr,
michael@0 2833 int width) {
michael@0 2834 asm volatile (
michael@0 2835 "sub %[u_buf],%[v_buf] \n"
michael@0 2836 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2837 "pxor %%xmm4,%%xmm4 \n"
michael@0 2838 LABELALIGN
michael@0 2839 "1: \n"
michael@0 2840 READYUV422
michael@0 2841 YUVTORGB
michael@0 2842 "punpcklbw %%xmm1,%%xmm2 \n"
michael@0 2843 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 2844 "movdqa %%xmm2,%%xmm1 \n"
michael@0 2845 "punpcklwd %%xmm0,%%xmm2 \n"
michael@0 2846 "punpckhwd %%xmm0,%%xmm1 \n"
michael@0 2847 "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n"
michael@0 2848 "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
michael@0 2849 "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
michael@0 2850 "sub $0x8,%[width] \n"
michael@0 2851 "jg 1b \n"
michael@0 2852 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2853 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2854 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2855 [dst_abgr]"+r"(dst_abgr), // %[dst_abgr]
michael@0 2856 [width]"+rm"(width) // %[width]
michael@0 2857 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2858 : "memory", "cc"
michael@0 2859 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2860 , "r14"
michael@0 2861 #endif
michael@0 2862 #if defined(__SSE2__)
michael@0 2863 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2864 #endif
michael@0 2865 );
michael@0 2866 }
michael@0 2867
michael@0 2868 void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
michael@0 2869 const uint8* u_buf,
michael@0 2870 const uint8* v_buf,
michael@0 2871 uint8* dst_rgba,
michael@0 2872 int width) {
michael@0 2873 asm volatile (
michael@0 2874 "sub %[u_buf],%[v_buf] \n"
michael@0 2875 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2876 "pxor %%xmm4,%%xmm4 \n"
michael@0 2877 LABELALIGN
michael@0 2878 "1: \n"
michael@0 2879 READYUV422
michael@0 2880 YUVTORGB
michael@0 2881 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 2882 "punpcklbw %%xmm2,%%xmm1 \n"
michael@0 2883 "punpcklbw %%xmm0,%%xmm5 \n"
michael@0 2884 "movdqa %%xmm5,%%xmm0 \n"
michael@0 2885 "punpcklwd %%xmm1,%%xmm5 \n"
michael@0 2886 "punpckhwd %%xmm1,%%xmm0 \n"
michael@0 2887 "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n"
michael@0 2888 "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
michael@0 2889 "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
michael@0 2890 "sub $0x8,%[width] \n"
michael@0 2891 "jg 1b \n"
michael@0 2892 : [y_buf]"+r"(y_buf), // %[y_buf]
michael@0 2893 [u_buf]"+r"(u_buf), // %[u_buf]
michael@0 2894 [v_buf]"+r"(v_buf), // %[v_buf]
michael@0 2895 [dst_rgba]"+r"(dst_rgba), // %[dst_rgba]
michael@0 2896 [width]"+rm"(width) // %[width]
michael@0 2897 : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
michael@0 2898 : "memory", "cc"
michael@0 2899 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2900 , "r14"
michael@0 2901 #endif
michael@0 2902 #if defined(__SSE2__)
michael@0 2903 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 2904 #endif
michael@0 2905 );
michael@0 2906 }
michael@0 2907
michael@0 2908 #endif // HAS_I422TOARGBROW_SSSE3
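
// The four I422To*Row variants above differ only in how YUVTORGB's B, G and
// R results (xmm0/xmm1/xmm2) are woven into the output byte order. For
// reference, a scalar sketch of the underlying BT.601 conversion, using the
// common 8.8 fixed point constants (illustrative only; not bit-exact with
// the kYuvConstants tables or the SIMD rounding):
static int ClampSketch(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}
static void YuvToRgbSketch(uint8 y, uint8 u, uint8 v,
                           uint8* b, uint8* g, uint8* r) {
  int c = (y - 16) * 298;  // 1.164 * 256
  *b = (uint8)ClampSketch((c + 516 * (u - 128) + 128) >> 8);  // 2.018 * 256
  *g = (uint8)ClampSketch((c - 100 * (u - 128) - 208 * (v - 128) + 128) >> 8);
  *r = (uint8)ClampSketch((c + 409 * (v - 128) + 128) >> 8);  // 1.596 * 256
}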
michael@0 2909
michael@0 2910 #ifdef HAS_YTOARGBROW_SSE2
michael@0 2911 void YToARGBRow_SSE2(const uint8* y_buf,
michael@0 2912 uint8* dst_argb,
michael@0 2913 int width) {
michael@0 2914 asm volatile (
michael@0 2915 "pxor %%xmm5,%%xmm5 \n"
michael@0 2916 "pcmpeqb %%xmm4,%%xmm4 \n"
michael@0 2917 "pslld $0x18,%%xmm4 \n"
michael@0 2918 "mov $0x00100010,%%eax \n"
michael@0 2919 "movd %%eax,%%xmm3 \n"
michael@0 2920 "pshufd $0x0,%%xmm3,%%xmm3 \n"
michael@0 2921 "mov $0x004a004a,%%eax \n"
michael@0 2922 "movd %%eax,%%xmm2 \n"
michael@0 2923 "pshufd $0x0,%%xmm2,%%xmm2 \n"
michael@0 2924 LABELALIGN
michael@0 2925 "1: \n"
michael@0 2926     // Step 1: Scale 8 Y values to gray: g = (y - 16) * 1.164, done as ((y - 16) * 0x4a) >> 6.
michael@0 2927 "movq " MEMACCESS(0) ",%%xmm0 \n"
michael@0 2928 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 2929 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 2930 "psubusw %%xmm3,%%xmm0 \n"
michael@0 2931 "pmullw %%xmm2,%%xmm0 \n"
michael@0 2932 "psrlw $6, %%xmm0 \n"
michael@0 2933 "packuswb %%xmm0,%%xmm0 \n"
michael@0 2934
michael@0 2935 // Step 2: Weave into ARGB
michael@0 2936 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 2937 "movdqa %%xmm0,%%xmm1 \n"
michael@0 2938 "punpcklwd %%xmm0,%%xmm0 \n"
michael@0 2939 "punpckhwd %%xmm1,%%xmm1 \n"
michael@0 2940 "por %%xmm4,%%xmm0 \n"
michael@0 2941 "por %%xmm4,%%xmm1 \n"
michael@0 2942 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 2943 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
michael@0 2944 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 2945
michael@0 2946 "sub $0x8,%2 \n"
michael@0 2947 "jg 1b \n"
michael@0 2948 : "+r"(y_buf), // %0
michael@0 2949 "+r"(dst_argb), // %1
michael@0 2950 "+rm"(width) // %2
michael@0 2951 :
michael@0 2952 : "memory", "cc", "eax"
michael@0 2953 #if defined(__SSE2__)
michael@0 2954 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
michael@0 2955 #endif
michael@0 2956 );
michael@0 2957 }
michael@0 2958 #endif // HAS_YTOARGBROW_SSE2
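
// Scalar sketch of the Y-only path above (illustrative). The asm does a
// saturating subtract of 16 (psubusw), multiplies by 0x4a and shifts right
// by 6 (0x4a / 64 ~= 1.164), and packuswb saturates the result to 255:
static uint8 YToGraySketch(uint8 y) {
  int v = ((y < 16 ? 0 : y - 16) * 0x4a) >> 6;
  return (uint8)(v > 255 ? 255 : v);
}
// The gray byte is then replicated into B, G and R by the punpck "weave"
// step, and por with xmm4 forces alpha to 0xff.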
michael@0 2959
michael@0 2960 #ifdef HAS_MIRRORROW_SSSE3
michael@0 2961 // Shuffle table for reversing the bytes.
michael@0 2962 static uvec8 kShuffleMirror = {
michael@0 2963 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
michael@0 2964 };
michael@0 2965
michael@0 2966 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
michael@0 2967 intptr_t temp_width = (intptr_t)(width);
michael@0 2968 asm volatile (
michael@0 2969 "movdqa %3,%%xmm5 \n"
michael@0 2970 "lea " MEMLEA(-0x10,0) ",%0 \n"
michael@0 2971 LABELALIGN
michael@0 2972 "1: \n"
michael@0 2973 MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0
michael@0 2974 "pshufb %%xmm5,%%xmm0 \n"
michael@0 2975 "sub $0x10,%2 \n"
michael@0 2976 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 2977 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 2978 "jg 1b \n"
michael@0 2979 : "+r"(src), // %0
michael@0 2980 "+r"(dst), // %1
michael@0 2981 "+r"(temp_width) // %2
michael@0 2982 : "m"(kShuffleMirror) // %3
michael@0 2983 : "memory", "cc"
michael@0 2984 #if defined(__native_client__) && defined(__x86_64__)
michael@0 2985 , "r14"
michael@0 2986 #endif
michael@0 2987 #if defined(__SSE2__)
michael@0 2988 , "xmm0", "xmm5"
michael@0 2989 #endif
michael@0 2990 );
michael@0 2991 }
michael@0 2992 #endif // HAS_MIRRORROW_SSSE3
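
// Scalar equivalent of MirrorRow (illustrative); the SSSE3 loop above does
// 16 of these per iteration with a single pshufb against kShuffleMirror:
static void MirrorRowSketch(const uint8* src, uint8* dst, int width) {
  int x;
  src += width - 1;  // start at the last byte
  for (x = 0; x < width; ++x) {
    dst[x] = src[0 - x];
  }
}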
michael@0 2993
michael@0 2994 #ifdef HAS_MIRRORROW_SSE2
michael@0 2995 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
michael@0 2996 intptr_t temp_width = (intptr_t)(width);
michael@0 2997 asm volatile (
michael@0 2998 "lea " MEMLEA(-0x10,0) ",%0 \n"
michael@0 2999 LABELALIGN
michael@0 3000 "1: \n"
michael@0 3001 MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0
michael@0 3002 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3003 "psllw $0x8,%%xmm0 \n"
michael@0 3004 "psrlw $0x8,%%xmm1 \n"
michael@0 3005 "por %%xmm1,%%xmm0 \n"
michael@0 3006 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
michael@0 3007 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
michael@0 3008 "pshufd $0x4e,%%xmm0,%%xmm0 \n"
michael@0 3009 "sub $0x10,%2 \n"
michael@0 3010 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 3011 "lea " MEMLEA(0x10,1)",%1 \n"
michael@0 3012 "jg 1b \n"
michael@0 3013 : "+r"(src), // %0
michael@0 3014 "+r"(dst), // %1
michael@0 3015 "+r"(temp_width) // %2
michael@0 3016 :
michael@0 3017 : "memory", "cc"
michael@0 3018 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3019 , "r14"
michael@0 3020 #endif
michael@0 3021 #if defined(__SSE2__)
michael@0 3022 , "xmm0", "xmm1"
michael@0 3023 #endif
michael@0 3024 );
michael@0 3025 }
michael@0 3026 #endif // HAS_MIRRORROW_SSE2
michael@0 3027
michael@0 3028 #ifdef HAS_MIRRORROW_UV_SSSE3
michael@0 3029 // Shuffle table that reverses and deinterleaves UV bytes: U to the low half, V to the high half.
michael@0 3030 static uvec8 kShuffleMirrorUV = {
michael@0 3031 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
michael@0 3032 };
michael@0 3033 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
michael@0 3034 int width) {
michael@0 3035 intptr_t temp_width = (intptr_t)(width);
michael@0 3036 asm volatile (
michael@0 3037 "movdqa %4,%%xmm1 \n"
michael@0 3038 "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n"
michael@0 3039 "sub %1,%2 \n"
michael@0 3040 LABELALIGN
michael@0 3041 "1: \n"
michael@0 3042 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3043 "lea " MEMLEA(-0x10,0) ",%0 \n"
michael@0 3044 "pshufb %%xmm1,%%xmm0 \n"
michael@0 3045 "sub $8,%3 \n"
michael@0 3046 "movlpd %%xmm0," MEMACCESS(1) " \n"
michael@0 3047 BUNDLEALIGN
michael@0 3048 MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2)
michael@0 3049 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3050 "jg 1b \n"
michael@0 3051 : "+r"(src), // %0
michael@0 3052 "+r"(dst_u), // %1
michael@0 3053 "+r"(dst_v), // %2
michael@0 3054 "+r"(temp_width) // %3
michael@0 3055 : "m"(kShuffleMirrorUV) // %4
michael@0 3056 : "memory", "cc"
michael@0 3057 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3058 , "r14"
michael@0 3059 #endif
michael@0 3060 #if defined(__SSE2__)
michael@0 3061 , "xmm0", "xmm1"
michael@0 3062 #endif
michael@0 3063 );
michael@0 3064 }
michael@0 3065 #endif // HAS_MIRRORROW_UV_SSSE3
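
// MirrorUVRow both reverses and deinterleaves: kShuffleMirrorUV sends the
// reversed even (U) bytes to the low 8 lanes and the reversed odd (V)
// bytes to the high 8 lanes in one pshufb. Scalar sketch (illustrative):
static void MirrorUVRowSketch(const uint8* src, uint8* dst_u, uint8* dst_v,
                              int width) {
  int x;
  src += (width - 1) * 2;  // last UV pair
  for (x = 0; x < width; ++x) {
    dst_u[x] = src[0 - x * 2];
    dst_v[x] = src[1 - x * 2];
  }
}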
michael@0 3066
michael@0 3067 #ifdef HAS_ARGBMIRRORROW_SSSE3
michael@0 3068 // Shuffle table for reversing pixel order (whole 4 byte ARGB units).
michael@0 3069 static uvec8 kARGBShuffleMirror = {
michael@0 3070 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
michael@0 3071 };
michael@0 3072
michael@0 3073 void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
michael@0 3074 intptr_t temp_width = (intptr_t)(width);
michael@0 3075 asm volatile (
michael@0 3076 "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n"
michael@0 3077 "movdqa %3,%%xmm5 \n"
michael@0 3078 LABELALIGN
michael@0 3079 "1: \n"
michael@0 3080 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3081 "pshufb %%xmm5,%%xmm0 \n"
michael@0 3082 "lea " MEMLEA(-0x10,0) ",%0 \n"
michael@0 3083 "sub $0x4,%2 \n"
michael@0 3084 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 3085 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3086 "jg 1b \n"
michael@0 3087 : "+r"(src), // %0
michael@0 3088 "+r"(dst), // %1
michael@0 3089 "+r"(temp_width) // %2
michael@0 3090 : "m"(kARGBShuffleMirror) // %3
michael@0 3091 : "memory", "cc"
michael@0 3092 #if defined(__SSE2__)
michael@0 3093 , "xmm0", "xmm5"
michael@0 3094 #endif
michael@0 3095 );
michael@0 3096 }
michael@0 3097 #endif // HAS_ARGBMIRRORROW_SSSE3
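
// ARGB mirroring must reverse whole pixels, not bytes, so kARGBShuffleMirror
// reverses the four dwords of a register while preserving the byte order
// inside each dword. Scalar sketch (illustrative):
static void ARGBMirrorRowSketch(const uint8* src, uint8* dst, int width) {
  const uint32* s = (const uint32*)(src) + width - 1;
  uint32* d = (uint32*)(dst);
  int x;
  for (x = 0; x < width; ++x) {
    d[x] = s[0 - x];  // copy pixels back to front
  }
}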
michael@0 3098
michael@0 3099 #ifdef HAS_SPLITUVROW_SSE2
michael@0 3100 void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3101 asm volatile (
michael@0 3102 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3103 "psrlw $0x8,%%xmm5 \n"
michael@0 3104 "sub %1,%2 \n"
michael@0 3105 LABELALIGN
michael@0 3106 "1: \n"
michael@0 3107 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3108 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3109 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3110 "movdqa %%xmm0,%%xmm2 \n"
michael@0 3111 "movdqa %%xmm1,%%xmm3 \n"
michael@0 3112 "pand %%xmm5,%%xmm0 \n"
michael@0 3113 "pand %%xmm5,%%xmm1 \n"
michael@0 3114 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3115 "psrlw $0x8,%%xmm2 \n"
michael@0 3116 "psrlw $0x8,%%xmm3 \n"
michael@0 3117 "packuswb %%xmm3,%%xmm2 \n"
michael@0 3118 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 3119 MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2)
michael@0 3120 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3121 "sub $0x10,%3 \n"
michael@0 3122 "jg 1b \n"
michael@0 3123 : "+r"(src_uv), // %0
michael@0 3124 "+r"(dst_u), // %1
michael@0 3125 "+r"(dst_v), // %2
michael@0 3126 "+r"(pix) // %3
michael@0 3127 :
michael@0 3128 : "memory", "cc"
michael@0 3129 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3130 , "r14"
michael@0 3131 #endif
michael@0 3132 #if defined(__SSE2__)
michael@0 3133 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 3134 #endif
michael@0 3135 );
michael@0 3136 }
michael@0 3137
michael@0 3138 void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
michael@0 3139 int pix) {
michael@0 3140 asm volatile (
michael@0 3141 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3142 "psrlw $0x8,%%xmm5 \n"
michael@0 3143 "sub %1,%2 \n"
michael@0 3144 LABELALIGN
michael@0 3145 "1: \n"
michael@0 3146 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3147 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3148 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3149 "movdqa %%xmm0,%%xmm2 \n"
michael@0 3150 "movdqa %%xmm1,%%xmm3 \n"
michael@0 3151 "pand %%xmm5,%%xmm0 \n"
michael@0 3152 "pand %%xmm5,%%xmm1 \n"
michael@0 3153 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3154 "psrlw $0x8,%%xmm2 \n"
michael@0 3155 "psrlw $0x8,%%xmm3 \n"
michael@0 3156 "packuswb %%xmm3,%%xmm2 \n"
michael@0 3157 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 3158 MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2)
michael@0 3159 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3160 "sub $0x10,%3 \n"
michael@0 3161 "jg 1b \n"
michael@0 3162 : "+r"(src_uv), // %0
michael@0 3163 "+r"(dst_u), // %1
michael@0 3164 "+r"(dst_v), // %2
michael@0 3165 "+r"(pix) // %3
michael@0 3166 :
michael@0 3167 : "memory", "cc"
michael@0 3168 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3169 , "r14"
michael@0 3170 #endif
michael@0 3171 #if defined(__SSE2__)
michael@0 3172 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 3173 #endif
michael@0 3174 );
michael@0 3175 }
michael@0 3176 #endif // HAS_SPLITUVROW_SSE2
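
// SplitUVRow deinterleaves packed UV (as in NV12) into planar U and V:
// pand with 0x00ff words keeps the even (U) bytes, psrlw $8 exposes the
// odd (V) bytes, and packuswb narrows both. Scalar sketch (illustrative):
static void SplitUVRowSketch(const uint8* src_uv, uint8* dst_u,
                             uint8* dst_v, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_u[x] = src_uv[x * 2 + 0];
    dst_v[x] = src_uv[x * 2 + 1];
  }
}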
michael@0 3177
michael@0 3178 #ifdef HAS_MERGEUVROW_SSE2
michael@0 3179 void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
michael@0 3180 int width) {
michael@0 3181 asm volatile (
michael@0 3182 "sub %0,%1 \n"
michael@0 3183 LABELALIGN
michael@0 3184 "1: \n"
michael@0 3185 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3186 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
michael@0 3187 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 3188 "movdqa %%xmm0,%%xmm2 \n"
michael@0 3189 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 3190 "punpckhbw %%xmm1,%%xmm2 \n"
michael@0 3191 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 3192 "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
michael@0 3193 "lea " MEMLEA(0x20,2) ",%2 \n"
michael@0 3194 "sub $0x10,%3 \n"
michael@0 3195 "jg 1b \n"
michael@0 3196 : "+r"(src_u), // %0
michael@0 3197 "+r"(src_v), // %1
michael@0 3198 "+r"(dst_uv), // %2
michael@0 3199 "+r"(width) // %3
michael@0 3200 :
michael@0 3201 : "memory", "cc"
michael@0 3202 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3203 , "r14"
michael@0 3204 #endif
michael@0 3205 #if defined(__SSE2__)
michael@0 3206 , "xmm0", "xmm1", "xmm2"
michael@0 3207 #endif
michael@0 3208 );
michael@0 3209 }
michael@0 3210
michael@0 3211 void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
michael@0 3212 uint8* dst_uv, int width) {
michael@0 3213 asm volatile (
michael@0 3214 "sub %0,%1 \n"
michael@0 3215 LABELALIGN
michael@0 3216 "1: \n"
michael@0 3217 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3218 MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1
michael@0 3219 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 3220 "movdqa %%xmm0,%%xmm2 \n"
michael@0 3221 "punpcklbw %%xmm1,%%xmm0 \n"
michael@0 3222 "punpckhbw %%xmm1,%%xmm2 \n"
michael@0 3223 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 3224 "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n"
michael@0 3225 "lea " MEMLEA(0x20,2) ",%2 \n"
michael@0 3226 "sub $0x10,%3 \n"
michael@0 3227 "jg 1b \n"
michael@0 3228 : "+r"(src_u), // %0
michael@0 3229 "+r"(src_v), // %1
michael@0 3230 "+r"(dst_uv), // %2
michael@0 3231 "+r"(width) // %3
michael@0 3232 :
michael@0 3233 : "memory", "cc"
michael@0 3234 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3235 , "r14"
michael@0 3236 #endif
michael@0 3237 #if defined(__SSE2__)
michael@0 3238 , "xmm0", "xmm1", "xmm2"
michael@0 3239 #endif
michael@0 3240 );
michael@0 3241 }
michael@0 3242 #endif // HAS_MERGEUVROW_SSE2
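
// MergeUVRow is the exact inverse of SplitUVRow: punpcklbw/punpckhbw
// interleave the two planes back into UV pairs. Scalar sketch (illustrative):
static void MergeUVRowSketch(const uint8* src_u, const uint8* src_v,
                             uint8* dst_uv, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_uv[x * 2 + 0] = src_u[x];
    dst_uv[x * 2 + 1] = src_v[x];
  }
}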
michael@0 3243
michael@0 3244 #ifdef HAS_COPYROW_SSE2
michael@0 3245 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
michael@0 3246 asm volatile (
michael@0 3247 LABELALIGN
michael@0 3248 "1: \n"
michael@0 3249 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3250 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3251 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3252 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 3253 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
michael@0 3254 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 3255 "sub $0x20,%2 \n"
michael@0 3256 "jg 1b \n"
michael@0 3257 : "+r"(src), // %0
michael@0 3258 "+r"(dst), // %1
michael@0 3259 "+r"(count) // %2
michael@0 3260 :
michael@0 3261 : "memory", "cc"
michael@0 3262 #if defined(__SSE2__)
michael@0 3263 , "xmm0", "xmm1"
michael@0 3264 #endif
michael@0 3265 );
michael@0 3266 }
michael@0 3267 #endif // HAS_COPYROW_SSE2
michael@0 3268
michael@0 3269 #ifdef HAS_COPYROW_X86
michael@0 3270 void CopyRow_X86(const uint8* src, uint8* dst, int width) {
michael@0 3271 size_t width_tmp = (size_t)(width);
michael@0 3272 asm volatile (
michael@0 3273 "shr $0x2,%2 \n"
michael@0 3274 "rep movsl " MEMMOVESTRING(0,1) " \n"
michael@0 3275 : "+S"(src), // %0
michael@0 3276 "+D"(dst), // %1
michael@0 3277 "+c"(width_tmp) // %2
michael@0 3278 :
michael@0 3279 : "memory", "cc"
michael@0 3280 );
michael@0 3281 }
michael@0 3282 #endif // HAS_COPYROW_X86
michael@0 3283
michael@0 3284 #ifdef HAS_COPYROW_ERMS
michael@0 3285 // Copies any width at byte granularity; no alignment requirement.
michael@0 3286 void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
michael@0 3287 size_t width_tmp = (size_t)(width);
michael@0 3288 asm volatile (
michael@0 3289 "rep movsb " MEMMOVESTRING(0,1) " \n"
michael@0 3290 : "+S"(src), // %0
michael@0 3291 "+D"(dst), // %1
michael@0 3292 "+c"(width_tmp) // %2
michael@0 3293 :
michael@0 3294 : "memory", "cc"
michael@0 3295 );
michael@0 3296 }
michael@0 3297 #endif // HAS_COPYROW_ERMS
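
// Both rep-string copies are effectively memcpy: CopyRow_X86 moves width/4
// dwords (width is assumed to be a multiple of 4), while CopyRow_ERMS moves
// single bytes, relying on Enhanced REP MOVSB microcode for speed.
// Equivalent sketch (assumes <string.h> is available):
static void CopyRowSketch(const uint8* src, uint8* dst, int count) {
  memcpy(dst, src, count);  // count is in bytes
}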
michael@0 3298
michael@0 3299 #ifdef HAS_ARGBCOPYALPHAROW_SSE2
michael@0 3300 // width in pixels
michael@0 3301 void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
michael@0 3302 asm volatile (
michael@0 3303 "pcmpeqb %%xmm0,%%xmm0 \n"
michael@0 3304 "pslld $0x18,%%xmm0 \n"
michael@0 3305 "pcmpeqb %%xmm1,%%xmm1 \n"
michael@0 3306 "psrld $0x8,%%xmm1 \n"
michael@0 3307 LABELALIGN
michael@0 3308 "1: \n"
michael@0 3309 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
michael@0 3310 "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
michael@0 3311 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3312 "movdqa " MEMACCESS(1) ",%%xmm4 \n"
michael@0 3313 "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
michael@0 3314 "pand %%xmm0,%%xmm2 \n"
michael@0 3315 "pand %%xmm0,%%xmm3 \n"
michael@0 3316 "pand %%xmm1,%%xmm4 \n"
michael@0 3317 "pand %%xmm1,%%xmm5 \n"
michael@0 3318 "por %%xmm4,%%xmm2 \n"
michael@0 3319 "por %%xmm5,%%xmm3 \n"
michael@0 3320 "movdqa %%xmm2," MEMACCESS(1) " \n"
michael@0 3321 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
michael@0 3322 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 3323 "sub $0x8,%2 \n"
michael@0 3324 "jg 1b \n"
michael@0 3325 : "+r"(src), // %0
michael@0 3326 "+r"(dst), // %1
michael@0 3327 "+r"(width) // %2
michael@0 3328 :
michael@0 3329 : "memory", "cc"
michael@0 3330 #if defined(__SSE2__)
michael@0 3331 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 3332 #endif
michael@0 3333 );
michael@0 3334 }
michael@0 3335 #endif // HAS_ARGBCOPYALPHAROW_SSE2
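
// ARGBCopyAlphaRow keeps dst's BGR bytes and takes src's A: src is masked
// with 0xff000000, dst with 0x00ffffff, and the results are OR'd together.
// Scalar sketch (illustrative):
static void ARGBCopyAlphaRowSketch(const uint8* src, uint8* dst, int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x * 4 + 3];  // byte 3 of each pixel is alpha
  }
}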
michael@0 3336
michael@0 3337 #ifdef HAS_ARGBCOPYALPHAROW_AVX2
michael@0 3338 // width in pixels
michael@0 3339 void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
michael@0 3340 asm volatile (
michael@0 3341 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
michael@0 3342 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
michael@0 3343 LABELALIGN
michael@0 3344 "1: \n"
michael@0 3345 "vmovdqu " MEMACCESS(0) ",%%ymm1 \n"
michael@0 3346 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n"
michael@0 3347 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 3348 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
michael@0 3349 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
michael@0 3350 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
michael@0 3351 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
michael@0 3352 "lea " MEMLEA(0x40,1) ",%1 \n"
michael@0 3353 "sub $0x10,%2 \n"
michael@0 3354 "jg 1b \n"
michael@0 3355 "vzeroupper \n"
michael@0 3356 : "+r"(src), // %0
michael@0 3357 "+r"(dst), // %1
michael@0 3358 "+r"(width) // %2
michael@0 3359 :
michael@0 3360 : "memory", "cc"
michael@0 3361 #if defined(__SSE2__)
michael@0 3362 , "xmm0", "xmm1", "xmm2"
michael@0 3363 #endif
michael@0 3364 );
michael@0 3365 }
michael@0 3366 #endif // HAS_ARGBCOPYALPHAROW_AVX2
michael@0 3367
michael@0 3368 #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
michael@0 3369 // width in pixels
michael@0 3370 void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
michael@0 3371 asm volatile (
michael@0 3372 "pcmpeqb %%xmm0,%%xmm0 \n"
michael@0 3373 "pslld $0x18,%%xmm0 \n"
michael@0 3374 "pcmpeqb %%xmm1,%%xmm1 \n"
michael@0 3375 "psrld $0x8,%%xmm1 \n"
michael@0 3376 LABELALIGN
michael@0 3377 "1: \n"
michael@0 3378 "movq " MEMACCESS(0) ",%%xmm2 \n"
michael@0 3379 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 3380 "punpcklbw %%xmm2,%%xmm2 \n"
michael@0 3381 "punpckhwd %%xmm2,%%xmm3 \n"
michael@0 3382 "punpcklwd %%xmm2,%%xmm2 \n"
michael@0 3383 "movdqa " MEMACCESS(1) ",%%xmm4 \n"
michael@0 3384 "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n"
michael@0 3385 "pand %%xmm0,%%xmm2 \n"
michael@0 3386 "pand %%xmm0,%%xmm3 \n"
michael@0 3387 "pand %%xmm1,%%xmm4 \n"
michael@0 3388 "pand %%xmm1,%%xmm5 \n"
michael@0 3389 "por %%xmm4,%%xmm2 \n"
michael@0 3390 "por %%xmm5,%%xmm3 \n"
michael@0 3391 "movdqa %%xmm2," MEMACCESS(1) " \n"
michael@0 3392 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
michael@0 3393 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 3394 "sub $0x8,%2 \n"
michael@0 3395 "jg 1b \n"
michael@0 3396 : "+r"(src), // %0
michael@0 3397 "+r"(dst), // %1
michael@0 3398 "+r"(width) // %2
michael@0 3399 :
michael@0 3400 : "memory", "cc"
michael@0 3401 #if defined(__SSE2__)
michael@0 3402 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 3403 #endif
michael@0 3404 );
michael@0 3405 }
michael@0 3406 #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2
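
// Scalar sketch of ARGBCopyYToAlphaRow (illustrative): each source Y byte
// becomes the alpha of one ARGB pixel; the BGR bytes already in dst are
// preserved:
static void ARGBCopyYToAlphaRowSketch(const uint8* src, uint8* dst,
                                      int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst[x * 4 + 3] = src[x];
  }
}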
michael@0 3407
michael@0 3408 #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
michael@0 3409 // width in pixels
michael@0 3410 void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
michael@0 3411 asm volatile (
michael@0 3412 "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n"
michael@0 3413 "vpsrld $0x8,%%ymm0,%%ymm0 \n"
michael@0 3414 LABELALIGN
michael@0 3415 "1: \n"
michael@0 3416 "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n"
michael@0 3417 "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n"
michael@0 3418 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 3419 "vpslld $0x18,%%ymm1,%%ymm1 \n"
michael@0 3420 "vpslld $0x18,%%ymm2,%%ymm2 \n"
michael@0 3421 "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n"
michael@0 3422 "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n"
michael@0 3423 "vmovdqu %%ymm1," MEMACCESS(1) " \n"
michael@0 3424 "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n"
michael@0 3425 "lea " MEMLEA(0x40,1) ",%1 \n"
michael@0 3426 "sub $0x10,%2 \n"
michael@0 3427 "jg 1b \n"
michael@0 3428 "vzeroupper \n"
michael@0 3429 : "+r"(src), // %0
michael@0 3430 "+r"(dst), // %1
michael@0 3431 "+r"(width) // %2
michael@0 3432 :
michael@0 3433 : "memory", "cc"
michael@0 3434 #if defined(__SSE2__)
michael@0 3435 , "xmm0", "xmm1", "xmm2"
michael@0 3436 #endif
michael@0 3437 );
michael@0 3438 }
michael@0 3439 #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2
michael@0 3440
michael@0 3441 #ifdef HAS_SETROW_X86
michael@0 3442 void SetRow_X86(uint8* dst, uint32 v32, int width) {
michael@0 3443 size_t width_tmp = (size_t)(width);
michael@0 3444 asm volatile (
michael@0 3445 "shr $0x2,%1 \n"
michael@0 3446 "rep stosl " MEMSTORESTRING(eax,0) " \n"
michael@0 3447 : "+D"(dst), // %0
michael@0 3448 "+c"(width_tmp) // %1
michael@0 3449 : "a"(v32) // %2
michael@0 3450 : "memory", "cc");
michael@0 3451 }
michael@0 3452
michael@0 3453 void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
michael@0 3454 int dst_stride, int height) {
michael@0 3455 for (int y = 0; y < height; ++y) {
michael@0 3456 size_t width_tmp = (size_t)(width);
michael@0 3457 uint32* d = (uint32*)(dst);
michael@0 3458 asm volatile (
michael@0 3459 "rep stosl " MEMSTORESTRING(eax,0) " \n"
michael@0 3460 : "+D"(d), // %0
michael@0 3461 "+c"(width_tmp) // %1
michael@0 3462 : "a"(v32) // %2
michael@0 3463 : "memory", "cc");
michael@0 3464 dst += dst_stride;
michael@0 3465 }
michael@0 3466 }
michael@0 3467 #endif // HAS_SETROW_X86
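
// SetRow_X86 stores width/4 copies of v32 (width there is in bytes and
// assumed to be a multiple of 4); ARGBSetRows_X86 stores width dwords per
// row, width there being in pixels. Scalar sketch of the single-row case
// (illustrative):
static void SetRowSketch(uint8* dst, uint32 v32, int width) {
  uint32* d = (uint32*)(dst);
  int x;
  for (x = 0; x < width / 4; ++x) {
    d[x] = v32;
  }
}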
michael@0 3468
michael@0 3469 #ifdef HAS_YUY2TOYROW_SSE2
michael@0 3470 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
michael@0 3471 asm volatile (
michael@0 3472 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3473 "psrlw $0x8,%%xmm5 \n"
michael@0 3474 LABELALIGN
michael@0 3475 "1: \n"
michael@0 3476 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3477 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3478 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3479 "pand %%xmm5,%%xmm0 \n"
michael@0 3480 "pand %%xmm5,%%xmm1 \n"
michael@0 3481 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3482 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 3483 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3484 "sub $0x10,%2 \n"
michael@0 3485 "jg 1b \n"
michael@0 3486 : "+r"(src_yuy2), // %0
michael@0 3487 "+r"(dst_y), // %1
michael@0 3488 "+r"(pix) // %2
michael@0 3489 :
michael@0 3490 : "memory", "cc"
michael@0 3491 #if defined(__SSE2__)
michael@0 3492 , "xmm0", "xmm1", "xmm5"
michael@0 3493 #endif
michael@0 3494 );
michael@0 3495 }
michael@0 3496
michael@0 3497 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
michael@0 3498 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3499 asm volatile (
michael@0 3500 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3501 "psrlw $0x8,%%xmm5 \n"
michael@0 3502 "sub %1,%2 \n"
michael@0 3503 LABELALIGN
michael@0 3504 "1: \n"
michael@0 3505 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3506 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3507 BUNDLEALIGN
michael@0 3508 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
michael@0 3509 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
michael@0 3510 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3511 "pavgb %%xmm2,%%xmm0 \n"
michael@0 3512 "pavgb %%xmm3,%%xmm1 \n"
michael@0 3513 "psrlw $0x8,%%xmm0 \n"
michael@0 3514 "psrlw $0x8,%%xmm1 \n"
michael@0 3515 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3516 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3517 "pand %%xmm5,%%xmm0 \n"
michael@0 3518 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3519 "psrlw $0x8,%%xmm1 \n"
michael@0 3520 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3521 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3522 BUNDLEALIGN
michael@0 3523 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3524 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3525 "sub $0x10,%3 \n"
michael@0 3526 "jg 1b \n"
michael@0 3527 : "+r"(src_yuy2), // %0
michael@0 3528 "+r"(dst_u), // %1
michael@0 3529 "+r"(dst_v), // %2
michael@0 3530 "+r"(pix) // %3
michael@0 3531 : "r"((intptr_t)(stride_yuy2)) // %4
michael@0 3532 : "memory", "cc"
michael@0 3533 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3534 , "r14"
michael@0 3535 #endif
michael@0 3536 #if defined(__SSE2__)
michael@0 3537 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 3538 #endif
michael@0 3539 );
michael@0 3540 }
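
// YUY2ToUVRow averages two source rows (stride_yuy2 apart) with pavgb to
// produce the vertically subsampled chroma needed for I420. Scalar sketch
// for one output U/V pair (illustrative; pavgb rounds up):
static void YUY2ToUVPairSketch(const uint8* row0, const uint8* row1,
                               uint8* u, uint8* v) {
  // YUY2 byte order is Y0 U Y1 V for each two-pixel group.
  *u = (uint8)((row0[1] + row1[1] + 1) >> 1);
  *v = (uint8)((row0[3] + row1[3] + 1) >> 1);
}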
michael@0 3541
michael@0 3542 void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
michael@0 3543 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3544 asm volatile (
michael@0 3545 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3546 "psrlw $0x8,%%xmm5 \n"
michael@0 3547 "sub %1,%2 \n"
michael@0 3548 LABELALIGN
michael@0 3549 "1: \n"
michael@0 3550 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3551 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3552 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3553 "psrlw $0x8,%%xmm0 \n"
michael@0 3554 "psrlw $0x8,%%xmm1 \n"
michael@0 3555 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3556 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3557 "pand %%xmm5,%%xmm0 \n"
michael@0 3558 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3559 "psrlw $0x8,%%xmm1 \n"
michael@0 3560 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3561 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3562 BUNDLEALIGN
michael@0 3563 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3564 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3565 "sub $0x10,%3 \n"
michael@0 3566 "jg 1b \n"
michael@0 3567 : "+r"(src_yuy2), // %0
michael@0 3568 "+r"(dst_u), // %1
michael@0 3569 "+r"(dst_v), // %2
michael@0 3570 "+r"(pix) // %3
michael@0 3571 :
michael@0 3572 : "memory", "cc"
michael@0 3573 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3574 , "r14"
michael@0 3575 #endif
michael@0 3576 #if defined(__SSE2__)
michael@0 3577 , "xmm0", "xmm1", "xmm5"
michael@0 3578 #endif
michael@0 3579 );
michael@0 3580 }
michael@0 3581
michael@0 3582 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
michael@0 3583 uint8* dst_y, int pix) {
michael@0 3584 asm volatile (
michael@0 3585 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3586 "psrlw $0x8,%%xmm5 \n"
michael@0 3587 LABELALIGN
michael@0 3588 "1: \n"
michael@0 3589 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3590 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3591 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3592 "pand %%xmm5,%%xmm0 \n"
michael@0 3593 "pand %%xmm5,%%xmm1 \n"
michael@0 3594 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3595 "sub $0x10,%2 \n"
michael@0 3596 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 3597 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3598 "jg 1b \n"
michael@0 3599 : "+r"(src_yuy2), // %0
michael@0 3600 "+r"(dst_y), // %1
michael@0 3601 "+r"(pix) // %2
michael@0 3602 :
michael@0 3603 : "memory", "cc"
michael@0 3604 #if defined(__SSE2__)
michael@0 3605 , "xmm0", "xmm1", "xmm5"
michael@0 3606 #endif
michael@0 3607 );
michael@0 3608 }
michael@0 3609
michael@0 3610 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
michael@0 3611 int stride_yuy2,
michael@0 3612 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3613 asm volatile (
michael@0 3614 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3615 "psrlw $0x8,%%xmm5 \n"
michael@0 3616 "sub %1,%2 \n"
michael@0 3617 LABELALIGN
michael@0 3618 "1: \n"
michael@0 3619 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3620 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3621 BUNDLEALIGN
michael@0 3622 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
michael@0 3623 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
michael@0 3624 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3625 "pavgb %%xmm2,%%xmm0 \n"
michael@0 3626 "pavgb %%xmm3,%%xmm1 \n"
michael@0 3627 "psrlw $0x8,%%xmm0 \n"
michael@0 3628 "psrlw $0x8,%%xmm1 \n"
michael@0 3629 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3630 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3631 "pand %%xmm5,%%xmm0 \n"
michael@0 3632 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3633 "psrlw $0x8,%%xmm1 \n"
michael@0 3634 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3635 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3636 BUNDLEALIGN
michael@0 3637 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3638 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3639 "sub $0x10,%3 \n"
michael@0 3640 "jg 1b \n"
michael@0 3641 : "+r"(src_yuy2), // %0
michael@0 3642 "+r"(dst_u), // %1
michael@0 3643 "+r"(dst_v), // %2
michael@0 3644 "+r"(pix) // %3
michael@0 3645 : "r"((intptr_t)(stride_yuy2)) // %4
michael@0 3646 : "memory", "cc"
michael@0 3647 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3648 , "r14"
michael@0 3649 #endif
michael@0 3650 #if defined(__SSE2__)
michael@0 3651 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 3652 #endif
michael@0 3653 );
michael@0 3654 }
michael@0 3655
michael@0 3656 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
michael@0 3657 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3658 asm volatile (
michael@0 3659 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3660 "psrlw $0x8,%%xmm5 \n"
michael@0 3661 "sub %1,%2 \n"
michael@0 3662 LABELALIGN
michael@0 3663 "1: \n"
michael@0 3664 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3665 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3666 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3667 "psrlw $0x8,%%xmm0 \n"
michael@0 3668 "psrlw $0x8,%%xmm1 \n"
michael@0 3669 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3670 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3671 "pand %%xmm5,%%xmm0 \n"
michael@0 3672 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3673 "psrlw $0x8,%%xmm1 \n"
michael@0 3674 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3675 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3676 BUNDLEALIGN
michael@0 3677 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3678 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3679 "sub $0x10,%3 \n"
michael@0 3680 "jg 1b \n"
michael@0 3681 : "+r"(src_yuy2), // %0
michael@0 3682 "+r"(dst_u), // %1
michael@0 3683 "+r"(dst_v), // %2
michael@0 3684 "+r"(pix) // %3
michael@0 3685 :
michael@0 3686 : "memory", "cc"
michael@0 3687 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3688 , "r14"
michael@0 3689 #endif
michael@0 3690 #if defined(__SSE2__)
michael@0 3691 , "xmm0", "xmm1", "xmm5"
michael@0 3692 #endif
michael@0 3693 );
michael@0 3694 }
michael@0 3695
michael@0 3696 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
michael@0 3697 asm volatile (
michael@0 3698 LABELALIGN
michael@0 3699 "1: \n"
michael@0 3700 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3701 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3702 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3703 "psrlw $0x8,%%xmm0 \n"
michael@0 3704 "psrlw $0x8,%%xmm1 \n"
michael@0 3705 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3706 "sub $0x10,%2 \n"
michael@0 3707 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 3708 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3709 "jg 1b \n"
michael@0 3710 : "+r"(src_uyvy), // %0
michael@0 3711 "+r"(dst_y), // %1
michael@0 3712 "+r"(pix) // %2
michael@0 3713 :
michael@0 3714 : "memory", "cc"
michael@0 3715 #if defined(__SSE2__)
michael@0 3716 , "xmm0", "xmm1"
michael@0 3717 #endif
michael@0 3718 );
michael@0 3719 }
michael@0 3720
michael@0 3721 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
michael@0 3722 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3723 asm volatile (
michael@0 3724 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3725 "psrlw $0x8,%%xmm5 \n"
michael@0 3726 "sub %1,%2 \n"
michael@0 3727 LABELALIGN
michael@0 3728 "1: \n"
michael@0 3729 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3730 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3731 BUNDLEALIGN
michael@0 3732 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
michael@0 3733 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
michael@0 3734 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3735 "pavgb %%xmm2,%%xmm0 \n"
michael@0 3736 "pavgb %%xmm3,%%xmm1 \n"
michael@0 3737 "pand %%xmm5,%%xmm0 \n"
michael@0 3738 "pand %%xmm5,%%xmm1 \n"
michael@0 3739 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3740 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3741 "pand %%xmm5,%%xmm0 \n"
michael@0 3742 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3743 "psrlw $0x8,%%xmm1 \n"
michael@0 3744 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3745 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3746 BUNDLEALIGN
michael@0 3747 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3748 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3749 "sub $0x10,%3 \n"
michael@0 3750 "jg 1b \n"
michael@0 3751 : "+r"(src_uyvy), // %0
michael@0 3752 "+r"(dst_u), // %1
michael@0 3753 "+r"(dst_v), // %2
michael@0 3754 "+r"(pix) // %3
michael@0 3755 : "r"((intptr_t)(stride_uyvy)) // %4
michael@0 3756 : "memory", "cc"
michael@0 3757 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3758 , "r14"
michael@0 3759 #endif
michael@0 3760 #if defined(__SSE2__)
michael@0 3761 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 3762 #endif
michael@0 3763 );
michael@0 3764 }
michael@0 3765
michael@0 3766 void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
michael@0 3767 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3768 asm volatile (
michael@0 3769 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3770 "psrlw $0x8,%%xmm5 \n"
michael@0 3771 "sub %1,%2 \n"
michael@0 3772 LABELALIGN
michael@0 3773 "1: \n"
michael@0 3774 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3775 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3776 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3777 "pand %%xmm5,%%xmm0 \n"
michael@0 3778 "pand %%xmm5,%%xmm1 \n"
michael@0 3779 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3780 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3781 "pand %%xmm5,%%xmm0 \n"
michael@0 3782 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3783 "psrlw $0x8,%%xmm1 \n"
michael@0 3784 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3785 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3786 BUNDLEALIGN
michael@0 3787 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3788 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3789 "sub $0x10,%3 \n"
michael@0 3790 "jg 1b \n"
michael@0 3791 : "+r"(src_uyvy), // %0
michael@0 3792 "+r"(dst_u), // %1
michael@0 3793 "+r"(dst_v), // %2
michael@0 3794 "+r"(pix) // %3
michael@0 3795 :
michael@0 3796 : "memory", "cc"
michael@0 3797 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3798 , "r14"
michael@0 3799 #endif
michael@0 3800 #if defined(__SSE2__)
michael@0 3801 , "xmm0", "xmm1", "xmm5"
michael@0 3802 #endif
michael@0 3803 );
michael@0 3804 }
michael@0 3805
michael@0 3806 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
michael@0 3807 uint8* dst_y, int pix) {
michael@0 3808 asm volatile (
michael@0 3809 LABELALIGN
michael@0 3810 "1: \n"
michael@0 3811 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3812 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3813 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3814 "psrlw $0x8,%%xmm0 \n"
michael@0 3815 "psrlw $0x8,%%xmm1 \n"
michael@0 3816 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3817 "sub $0x10,%2 \n"
michael@0 3818 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 3819 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3820 "jg 1b \n"
michael@0 3821 : "+r"(src_uyvy), // %0
michael@0 3822 "+r"(dst_y), // %1
michael@0 3823 "+r"(pix) // %2
michael@0 3824 :
michael@0 3825 : "memory", "cc"
michael@0 3826 #if defined(__SSE2__)
michael@0 3827 , "xmm0", "xmm1"
michael@0 3828 #endif
michael@0 3829 );
michael@0 3830 }
michael@0 3831
michael@0 3832 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
michael@0 3833 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3834 asm volatile (
michael@0 3835 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3836 "psrlw $0x8,%%xmm5 \n"
michael@0 3837 "sub %1,%2 \n"
michael@0 3838 LABELALIGN
michael@0 3839 "1: \n"
michael@0 3840 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3841 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3842 BUNDLEALIGN
michael@0 3843 MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2
michael@0 3844 MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3
michael@0 3845 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3846 "pavgb %%xmm2,%%xmm0 \n"
michael@0 3847 "pavgb %%xmm3,%%xmm1 \n"
michael@0 3848 "pand %%xmm5,%%xmm0 \n"
michael@0 3849 "pand %%xmm5,%%xmm1 \n"
michael@0 3850 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3851 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3852 "pand %%xmm5,%%xmm0 \n"
michael@0 3853 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3854 "psrlw $0x8,%%xmm1 \n"
michael@0 3855 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3856 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3857 BUNDLEALIGN
michael@0 3858 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3859 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3860 "sub $0x10,%3 \n"
michael@0 3861 "jg 1b \n"
michael@0 3862 : "+r"(src_uyvy), // %0
michael@0 3863 "+r"(dst_u), // %1
michael@0 3864 "+r"(dst_v), // %2
michael@0 3865 "+r"(pix) // %3
michael@0 3866 : "r"((intptr_t)(stride_uyvy)) // %4
michael@0 3867 : "memory", "cc"
michael@0 3868 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3869 , "r14"
michael@0 3870 #endif
michael@0 3871 #if defined(__SSE2__)
michael@0 3872 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 3873 #endif
michael@0 3874 );
michael@0 3875 }
michael@0 3876
michael@0 3877 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
michael@0 3878 uint8* dst_u, uint8* dst_v, int pix) {
michael@0 3879 asm volatile (
michael@0 3880 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3881 "psrlw $0x8,%%xmm5 \n"
michael@0 3882 "sub %1,%2 \n"
michael@0 3883 LABELALIGN
michael@0 3884 "1: \n"
michael@0 3885 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 3886 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 3887 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 3888 "pand %%xmm5,%%xmm0 \n"
michael@0 3889 "pand %%xmm5,%%xmm1 \n"
michael@0 3890 "packuswb %%xmm1,%%xmm0 \n"
michael@0 3891 "movdqa %%xmm0,%%xmm1 \n"
michael@0 3892 "pand %%xmm5,%%xmm0 \n"
michael@0 3893 "packuswb %%xmm0,%%xmm0 \n"
michael@0 3894 "psrlw $0x8,%%xmm1 \n"
michael@0 3895 "packuswb %%xmm1,%%xmm1 \n"
michael@0 3896 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 3897 BUNDLEALIGN
michael@0 3898 MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2)
michael@0 3899 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 3900 "sub $0x10,%3 \n"
michael@0 3901 "jg 1b \n"
michael@0 3902 : "+r"(src_uyvy), // %0
michael@0 3903 "+r"(dst_u), // %1
michael@0 3904 "+r"(dst_v), // %2
michael@0 3905 "+r"(pix) // %3
michael@0 3906 :
michael@0 3907 : "memory", "cc"
michael@0 3908 #if defined(__native_client__) && defined(__x86_64__)
michael@0 3909 , "r14"
michael@0 3910 #endif
michael@0 3911 #if defined(__SSE2__)
michael@0 3912 , "xmm0", "xmm1", "xmm5"
michael@0 3913 #endif
michael@0 3914 );
michael@0 3915 }
michael@0 3916 #endif // HAS_YUY2TOYROW_SSE2
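
// UYVY is YUY2 with the byte pairs swapped: U Y0 V Y1 instead of Y0 U Y1 V.
// That is why the UYVY luma path uses psrlw $8 where the YUY2 path masks
// with 0x00ff words, and the chroma paths swap those two operations.
// Scalar sketch of the luma extraction (illustrative):
static void UYVYToYRowSketch(const uint8* src_uyvy, uint8* dst_y, int pix) {
  int x;
  for (x = 0; x < pix; ++x) {
    dst_y[x] = src_uyvy[x * 2 + 1];  // Y bytes sit at odd offsets in UYVY
  }
}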
michael@0 3917
michael@0 3918 #ifdef HAS_ARGBBLENDROW_SSE2
michael@0 3919 // Blend 4 pixels at a time.
michael@0 3920 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
michael@0 3921 uint8* dst_argb, int width) {
michael@0 3922 asm volatile (
michael@0 3923 "pcmpeqb %%xmm7,%%xmm7 \n"
michael@0 3924 "psrlw $0xf,%%xmm7 \n"
michael@0 3925 "pcmpeqb %%xmm6,%%xmm6 \n"
michael@0 3926 "psrlw $0x8,%%xmm6 \n"
michael@0 3927 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 3928 "psllw $0x8,%%xmm5 \n"
michael@0 3929 "pcmpeqb %%xmm4,%%xmm4 \n"
michael@0 3930 "pslld $0x18,%%xmm4 \n"
michael@0 3931 "sub $0x1,%3 \n"
michael@0 3932 "je 91f \n"
michael@0 3933 "jl 99f \n"
michael@0 3934
michael@0 3935 // 1 pixel loop until destination pointer is aligned.
michael@0 3936 "10: \n"
michael@0 3937 "test $0xf,%2 \n"
michael@0 3938 "je 19f \n"
michael@0 3939 "movd " MEMACCESS(0) ",%%xmm3 \n"
michael@0 3940 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 3941 "movdqa %%xmm3,%%xmm0 \n"
michael@0 3942 "pxor %%xmm4,%%xmm3 \n"
michael@0 3943 "movd " MEMACCESS(1) ",%%xmm2 \n"
michael@0 3944 "psrlw $0x8,%%xmm3 \n"
michael@0 3945 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
michael@0 3946 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
michael@0 3947 "pand %%xmm6,%%xmm2 \n"
michael@0 3948 "paddw %%xmm7,%%xmm3 \n"
michael@0 3949 "pmullw %%xmm3,%%xmm2 \n"
michael@0 3950 "movd " MEMACCESS(1) ",%%xmm1 \n"
michael@0 3951 "lea " MEMLEA(0x4,1) ",%1 \n"
michael@0 3952 "psrlw $0x8,%%xmm1 \n"
michael@0 3953 "por %%xmm4,%%xmm0 \n"
michael@0 3954 "pmullw %%xmm3,%%xmm1 \n"
michael@0 3955 "psrlw $0x8,%%xmm2 \n"
michael@0 3956 "paddusb %%xmm2,%%xmm0 \n"
michael@0 3957 "pand %%xmm5,%%xmm1 \n"
michael@0 3958 "paddusb %%xmm1,%%xmm0 \n"
michael@0 3959 "sub $0x1,%3 \n"
michael@0 3960 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 3961 "lea " MEMLEA(0x4,2) ",%2 \n"
michael@0 3962 "jge 10b \n"
michael@0 3963
michael@0 3964 "19: \n"
michael@0 3965 "add $1-4,%3 \n"
michael@0 3966 "jl 49f \n"
michael@0 3967
michael@0 3968 // 4 pixel loop.
michael@0 3969 LABELALIGN
michael@0 3970 "41: \n"
michael@0 3971 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
michael@0 3972 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 3973 "movdqa %%xmm3,%%xmm0 \n"
michael@0 3974 "pxor %%xmm4,%%xmm3 \n"
michael@0 3975 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
michael@0 3976 "psrlw $0x8,%%xmm3 \n"
michael@0 3977 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
michael@0 3978 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
michael@0 3979 "pand %%xmm6,%%xmm2 \n"
michael@0 3980 "paddw %%xmm7,%%xmm3 \n"
michael@0 3981 "pmullw %%xmm3,%%xmm2 \n"
michael@0 3982 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
michael@0 3983 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 3984 "psrlw $0x8,%%xmm1 \n"
michael@0 3985 "por %%xmm4,%%xmm0 \n"
michael@0 3986 "pmullw %%xmm3,%%xmm1 \n"
michael@0 3987 "psrlw $0x8,%%xmm2 \n"
michael@0 3988 "paddusb %%xmm2,%%xmm0 \n"
michael@0 3989 "pand %%xmm5,%%xmm1 \n"
michael@0 3990 "paddusb %%xmm1,%%xmm0 \n"
michael@0 3991 "sub $0x4,%3 \n"
michael@0 3992 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 3993 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 3994 "jge 41b \n"
michael@0 3995
michael@0 3996 "49: \n"
michael@0 3997 "add $0x3,%3 \n"
michael@0 3998 "jl 99f \n"
michael@0 3999
michael@0 4000 // 1 pixel loop.
michael@0 4001 "91: \n"
michael@0 4002 "movd " MEMACCESS(0) ",%%xmm3 \n"
michael@0 4003 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 4004 "movdqa %%xmm3,%%xmm0 \n"
michael@0 4005 "pxor %%xmm4,%%xmm3 \n"
michael@0 4006 "movd " MEMACCESS(1) ",%%xmm2 \n"
michael@0 4007 "psrlw $0x8,%%xmm3 \n"
michael@0 4008 "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
michael@0 4009 "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
michael@0 4010 "pand %%xmm6,%%xmm2 \n"
michael@0 4011 "paddw %%xmm7,%%xmm3 \n"
michael@0 4012 "pmullw %%xmm3,%%xmm2 \n"
michael@0 4013 "movd " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4014 "lea " MEMLEA(0x4,1) ",%1 \n"
michael@0 4015 "psrlw $0x8,%%xmm1 \n"
michael@0 4016 "por %%xmm4,%%xmm0 \n"
michael@0 4017 "pmullw %%xmm3,%%xmm1 \n"
michael@0 4018 "psrlw $0x8,%%xmm2 \n"
michael@0 4019 "paddusb %%xmm2,%%xmm0 \n"
michael@0 4020 "pand %%xmm5,%%xmm1 \n"
michael@0 4021 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4022 "sub $0x1,%3 \n"
michael@0 4023 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 4024 "lea " MEMLEA(0x4,2) ",%2 \n"
michael@0 4025 "jge 91b \n"
michael@0 4026 "99: \n"
michael@0 4027 : "+r"(src_argb0), // %0
michael@0 4028 "+r"(src_argb1), // %1
michael@0 4029 "+r"(dst_argb), // %2
michael@0 4030 "+r"(width) // %3
michael@0 4031 :
michael@0 4032 : "memory", "cc"
michael@0 4033 #if defined(__SSE2__)
michael@0 4034 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 4035 #endif
michael@0 4036 );
michael@0 4037 }
michael@0 4038 #endif // HAS_ARGBBLENDROW_SSE2
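
// Per channel, the blend above computes dst = src0 + (src1 * (256 - a0) >> 8)
// with dst alpha forced to 0xff, i.e. an "over" that expects the foreground
// (src0) to be premultiplied (see ARGBAttenuateRow below). Scalar sketch of
// one channel (illustrative, same >>8 approximation as the SIMD code):
static uint8 BlendChannelSketch(uint8 fg, uint8 bg, uint8 fg_alpha) {
  uint32 out = fg + ((bg * (256 - fg_alpha)) >> 8);
  return (uint8)(out > 255 ? 255 : out);  // paddusb saturates likewise
}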
michael@0 4039
michael@0 4040 #ifdef HAS_ARGBBLENDROW_SSSE3
michael@0 4041 // Shuffle table for isolating alpha.
michael@0 4042 static uvec8 kShuffleAlpha = {
michael@0 4043 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
michael@0 4044 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
michael@0 4045 };
michael@0 4046
michael@0 4047 // Blend 4 pixels at a time.
michael@0 4048 
michael@0 4049 // Same as the SSE2 version, but replaces
michael@0 4050 //   psrlw   xmm3, 8             // alpha
michael@0 4051 //   pshufhw xmm3, xmm3, 0F5h    // 8 alpha words
michael@0 4052 //   pshuflw xmm3, xmm3, 0F5h
michael@0 4053 // with a single shuffle:
michael@0 4054 //   pshufb  xmm3, kShuffleAlpha // alpha
michael@0 4055 
michael@0 4057 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
michael@0 4058 uint8* dst_argb, int width) {
michael@0 4059 asm volatile (
michael@0 4060 "pcmpeqb %%xmm7,%%xmm7 \n"
michael@0 4061 "psrlw $0xf,%%xmm7 \n"
michael@0 4062 "pcmpeqb %%xmm6,%%xmm6 \n"
michael@0 4063 "psrlw $0x8,%%xmm6 \n"
michael@0 4064 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 4065 "psllw $0x8,%%xmm5 \n"
michael@0 4066 "pcmpeqb %%xmm4,%%xmm4 \n"
michael@0 4067 "pslld $0x18,%%xmm4 \n"
michael@0 4068 "sub $0x1,%3 \n"
michael@0 4069 "je 91f \n"
michael@0 4070 "jl 99f \n"
michael@0 4071
michael@0 4072 // 1 pixel loop until destination pointer is aligned.
michael@0 4073 "10: \n"
michael@0 4074 "test $0xf,%2 \n"
michael@0 4075 "je 19f \n"
michael@0 4076 "movd " MEMACCESS(0) ",%%xmm3 \n"
michael@0 4077 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 4078 "movdqa %%xmm3,%%xmm0 \n"
michael@0 4079 "pxor %%xmm4,%%xmm3 \n"
michael@0 4080 "movd " MEMACCESS(1) ",%%xmm2 \n"
michael@0 4081 "pshufb %4,%%xmm3 \n"
michael@0 4082 "pand %%xmm6,%%xmm2 \n"
michael@0 4083 "paddw %%xmm7,%%xmm3 \n"
michael@0 4084 "pmullw %%xmm3,%%xmm2 \n"
michael@0 4085 "movd " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4086 "lea " MEMLEA(0x4,1) ",%1 \n"
michael@0 4087 "psrlw $0x8,%%xmm1 \n"
michael@0 4088 "por %%xmm4,%%xmm0 \n"
michael@0 4089 "pmullw %%xmm3,%%xmm1 \n"
michael@0 4090 "psrlw $0x8,%%xmm2 \n"
michael@0 4091 "paddusb %%xmm2,%%xmm0 \n"
michael@0 4092 "pand %%xmm5,%%xmm1 \n"
michael@0 4093 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4094 "sub $0x1,%3 \n"
michael@0 4095 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 4096 "lea " MEMLEA(0x4,2) ",%2 \n"
michael@0 4097 "jge 10b \n"
michael@0 4098
michael@0 4099 "19: \n"
michael@0 4100 "add $1-4,%3 \n"
michael@0 4101 "jl 49f \n"
michael@0 4102 "test $0xf,%0 \n"
michael@0 4103 "jne 41f \n"
michael@0 4104 "test $0xf,%1 \n"
michael@0 4105 "jne 41f \n"
michael@0 4106
michael@0 4107 // 4 pixel loop.
michael@0 4108 LABELALIGN
michael@0 4109 "40: \n"
michael@0 4110 "movdqa " MEMACCESS(0) ",%%xmm3 \n"
michael@0 4111 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4112 "movdqa %%xmm3,%%xmm0 \n"
michael@0 4113 "pxor %%xmm4,%%xmm3 \n"
michael@0 4114 "movdqa " MEMACCESS(1) ",%%xmm2 \n"
michael@0 4115 "pshufb %4,%%xmm3 \n"
michael@0 4116 "pand %%xmm6,%%xmm2 \n"
michael@0 4117 "paddw %%xmm7,%%xmm3 \n"
michael@0 4118 "pmullw %%xmm3,%%xmm2 \n"
michael@0 4119 "movdqa " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4120 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4121 "psrlw $0x8,%%xmm1 \n"
michael@0 4122 "por %%xmm4,%%xmm0 \n"
michael@0 4123 "pmullw %%xmm3,%%xmm1 \n"
michael@0 4124 "psrlw $0x8,%%xmm2 \n"
michael@0 4125 "paddusb %%xmm2,%%xmm0 \n"
michael@0 4126 "pand %%xmm5,%%xmm1 \n"
michael@0 4127 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4128 "sub $0x4,%3 \n"
michael@0 4129 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 4130 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 4131 "jge 40b \n"
michael@0 4132 "jmp 49f \n"
michael@0 4133
michael@0 4134 // 4 pixel unaligned loop.
michael@0 4135 LABELALIGN
michael@0 4136 "41: \n"
michael@0 4137 "movdqu " MEMACCESS(0) ",%%xmm3 \n"
michael@0 4138 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4139 "movdqa %%xmm3,%%xmm0 \n"
michael@0 4140 "pxor %%xmm4,%%xmm3 \n"
michael@0 4141 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
michael@0 4142 "pshufb %4,%%xmm3 \n"
michael@0 4143 "pand %%xmm6,%%xmm2 \n"
michael@0 4144 "paddw %%xmm7,%%xmm3 \n"
michael@0 4145 "pmullw %%xmm3,%%xmm2 \n"
michael@0 4146 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4147 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4148 "psrlw $0x8,%%xmm1 \n"
michael@0 4149 "por %%xmm4,%%xmm0 \n"
michael@0 4150 "pmullw %%xmm3,%%xmm1 \n"
michael@0 4151 "psrlw $0x8,%%xmm2 \n"
michael@0 4152 "paddusb %%xmm2,%%xmm0 \n"
michael@0 4153 "pand %%xmm5,%%xmm1 \n"
michael@0 4154 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4155 "sub $0x4,%3 \n"
michael@0 4156 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 4157 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 4158 "jge 41b \n"
michael@0 4159
michael@0 4160 "49: \n"
michael@0 4161 "add $0x3,%3 \n"
michael@0 4162 "jl 99f \n"
michael@0 4163
michael@0 4164 // 1 pixel loop.
michael@0 4165 "91: \n"
michael@0 4166 "movd " MEMACCESS(0) ",%%xmm3 \n"
michael@0 4167 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 4168 "movdqa %%xmm3,%%xmm0 \n"
michael@0 4169 "pxor %%xmm4,%%xmm3 \n"
michael@0 4170 "movd " MEMACCESS(1) ",%%xmm2 \n"
michael@0 4171 "pshufb %4,%%xmm3 \n"
michael@0 4172 "pand %%xmm6,%%xmm2 \n"
michael@0 4173 "paddw %%xmm7,%%xmm3 \n"
michael@0 4174 "pmullw %%xmm3,%%xmm2 \n"
michael@0 4175 "movd " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4176 "lea " MEMLEA(0x4,1) ",%1 \n"
michael@0 4177 "psrlw $0x8,%%xmm1 \n"
michael@0 4178 "por %%xmm4,%%xmm0 \n"
michael@0 4179 "pmullw %%xmm3,%%xmm1 \n"
michael@0 4180 "psrlw $0x8,%%xmm2 \n"
michael@0 4181 "paddusb %%xmm2,%%xmm0 \n"
michael@0 4182 "pand %%xmm5,%%xmm1 \n"
michael@0 4183 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4184 "sub $0x1,%3 \n"
michael@0 4185 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 4186 "lea " MEMLEA(0x4,2) ",%2 \n"
michael@0 4187 "jge 91b \n"
michael@0 4188 "99: \n"
michael@0 4189 : "+r"(src_argb0), // %0
michael@0 4190 "+r"(src_argb1), // %1
michael@0 4191 "+r"(dst_argb), // %2
michael@0 4192 "+r"(width) // %3
michael@0 4193 : "m"(kShuffleAlpha) // %4
michael@0 4194 : "memory", "cc"
michael@0 4195 #if defined(__SSE2__)
michael@0 4196 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 4197 #endif
michael@0 4198 );
michael@0 4199 }
michael@0 4200 #endif // HAS_ARGBBLENDROW_SSSE3
michael@0 4201
michael@0 4202 #ifdef HAS_ARGBATTENUATEROW_SSE2
michael@0 4203 // Attenuate 4 pixels at a time.
michael@0 4204 // Requires 16 byte aligned src and dst (movdqa loads/stores).
michael@0 4205 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
michael@0 4206 asm volatile (
michael@0 4207 "pcmpeqb %%xmm4,%%xmm4 \n"
michael@0 4208 "pslld $0x18,%%xmm4 \n"
michael@0 4209 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 4210 "psrld $0x8,%%xmm5 \n"
michael@0 4211
michael@0 4212 // 4 pixel loop.
michael@0 4213 LABELALIGN
michael@0 4214 "1: \n"
michael@0 4215 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4216 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 4217 "pshufhw $0xff,%%xmm0,%%xmm2 \n"
michael@0 4218 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
michael@0 4219 "pmulhuw %%xmm2,%%xmm0 \n"
michael@0 4220 "movdqa " MEMACCESS(0) ",%%xmm1 \n"
michael@0 4221 "punpckhbw %%xmm1,%%xmm1 \n"
michael@0 4222 "pshufhw $0xff,%%xmm1,%%xmm2 \n"
michael@0 4223 "pshuflw $0xff,%%xmm2,%%xmm2 \n"
michael@0 4224 "pmulhuw %%xmm2,%%xmm1 \n"
michael@0 4225 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
michael@0 4226 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4227 "psrlw $0x8,%%xmm0 \n"
michael@0 4228 "pand %%xmm4,%%xmm2 \n"
michael@0 4229 "psrlw $0x8,%%xmm1 \n"
michael@0 4230 "packuswb %%xmm1,%%xmm0 \n"
michael@0 4231 "pand %%xmm5,%%xmm0 \n"
michael@0 4232 "por %%xmm2,%%xmm0 \n"
michael@0 4233 "sub $0x4,%2 \n"
michael@0 4234 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 4235 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4236 "jg 1b \n"
michael@0 4237 : "+r"(src_argb), // %0
michael@0 4238 "+r"(dst_argb), // %1
michael@0 4239 "+r"(width) // %2
michael@0 4240 :
michael@0 4241 : "memory", "cc"
michael@0 4242 #if defined(__SSE2__)
michael@0 4243 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 4244 #endif
michael@0 4245 );
michael@0 4246 }
michael@0 4247 #endif // HAS_ARGBATTENUATEROW_SSE2
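
// Attenuation multiplies each color channel by its alpha. punpcklbw of a
// byte with itself yields c * 0x101, and pmulhuw then gives
// (c * 0x101) * (a * 0x101) >> 16; the final psrlw $8 makes the whole
// sequence a close approximation of c * a / 255. Scalar sketch
// (illustrative):
static uint8 AttenuateChannelSketch(uint8 c, uint8 a) {
  return (uint8)(((uint32)(c * 0x101) * (uint32)(a * 0x101)) >> 24);
}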
michael@0 4248
michael@0 4249 #ifdef HAS_ARGBATTENUATEROW_SSSE3
michael@0 4250 // Shuffle table duplicating alpha
michael@0 4251 static uvec8 kShuffleAlpha0 = {
michael@0 4252 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
michael@0 4253 };
michael@0 4254 static uvec8 kShuffleAlpha1 = {
michael@0 4255 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
michael@0 4256 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
michael@0 4257 };
michael@0 4258 // Attenuate 4 pixels at a time.
michael@0 4259 // Uses movdqu, so src and dst need not be 16 byte aligned.
michael@0 4260 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
michael@0 4261 asm volatile (
michael@0 4262 "pcmpeqb %%xmm3,%%xmm3 \n"
michael@0 4263 "pslld $0x18,%%xmm3 \n"
michael@0 4264 "movdqa %3,%%xmm4 \n"
michael@0 4265 "movdqa %4,%%xmm5 \n"
michael@0 4266
michael@0 4267 // 4 pixel loop.
michael@0 4268 LABELALIGN
michael@0 4269 "1: \n"
michael@0 4270 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4271 "pshufb %%xmm4,%%xmm0 \n"
michael@0 4272 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
michael@0 4273 "punpcklbw %%xmm1,%%xmm1 \n"
michael@0 4274 "pmulhuw %%xmm1,%%xmm0 \n"
michael@0 4275 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
michael@0 4276 "pshufb %%xmm5,%%xmm1 \n"
michael@0 4277 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
michael@0 4278 "punpckhbw %%xmm2,%%xmm2 \n"
michael@0 4279 "pmulhuw %%xmm2,%%xmm1 \n"
michael@0 4280 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
michael@0 4281 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4282 "pand %%xmm3,%%xmm2 \n"
michael@0 4283 "psrlw $0x8,%%xmm0 \n"
michael@0 4284 "psrlw $0x8,%%xmm1 \n"
michael@0 4285 "packuswb %%xmm1,%%xmm0 \n"
michael@0 4286 "por %%xmm2,%%xmm0 \n"
michael@0 4287 "sub $0x4,%2 \n"
michael@0 4288 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 4289 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4290 "jg 1b \n"
michael@0 4291 : "+r"(src_argb), // %0
michael@0 4292 "+r"(dst_argb), // %1
michael@0 4293 "+r"(width) // %2
michael@0 4294 : "m"(kShuffleAlpha0), // %3
michael@0 4295 "m"(kShuffleAlpha1) // %4
michael@0 4296 : "memory", "cc"
michael@0 4297 #if defined(__SSE2__)
michael@0 4298 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 4299 #endif
michael@0 4300 );
michael@0 4301 }
michael@0 4302 #endif // HAS_ARGBATTENUATEROW_SSSE3
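
// A scalar sketch of what the attenuate loops above compute (the name and
// code are illustrative, not part of libyuv): each color channel is scaled
// by its pixel's alpha and alpha itself is preserved. The SIMD path widens
// bytes to words (c * 0x101) and keeps the high bits, which approximates
// c * a / 255; the >> 8 below is the same shortcut.
static void ARGBAttenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                    int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    dst_argb[0] = (uint8)(src_argb[0] * a >> 8);
    dst_argb[1] = (uint8)(src_argb[1] * a >> 8);
    dst_argb[2] = (uint8)(src_argb[2] * a >> 8);
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}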
michael@0 4303
michael@0 4304 #ifdef HAS_ARGBUNATTENUATEROW_SSE2
michael@0 4305 // Unattenuate 4 pixels at a time.
michael@0 4306 // Handles unaligned memory (loads and stores use movdqu).
michael@0 4307 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
michael@0 4308 int width) {
michael@0 4309 uintptr_t alpha = 0;
michael@0 4310 asm volatile (
michael@0 4311 // 4 pixel loop.
michael@0 4312 LABELALIGN
michael@0 4313 "1: \n"
michael@0 4314 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4315 "movzb " MEMACCESS2(0x03,0) ",%3 \n"
michael@0 4316 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 4317 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
michael@0 4318 "movzb " MEMACCESS2(0x07,0) ",%3 \n"
michael@0 4319 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
michael@0 4320 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
michael@0 4321 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
michael@0 4322 "movlhps %%xmm3,%%xmm2 \n"
michael@0 4323 "pmulhuw %%xmm2,%%xmm0 \n"
michael@0 4324 "movdqu " MEMACCESS(0) ",%%xmm1 \n"
michael@0 4325 "movzb " MEMACCESS2(0x0b,0) ",%3 \n"
michael@0 4326 "punpckhbw %%xmm1,%%xmm1 \n"
michael@0 4327 BUNDLEALIGN
michael@0 4328 MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2
michael@0 4329 "movzb " MEMACCESS2(0x0f,0) ",%3 \n"
michael@0 4330 MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3
michael@0 4331 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
michael@0 4332 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
michael@0 4333 "movlhps %%xmm3,%%xmm2 \n"
michael@0 4334 "pmulhuw %%xmm2,%%xmm1 \n"
michael@0 4335 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4336 "packuswb %%xmm1,%%xmm0 \n"
michael@0 4337 "sub $0x4,%2 \n"
michael@0 4338 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 4339 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4340 "jg 1b \n"
michael@0 4341 : "+r"(src_argb), // %0
michael@0 4342 "+r"(dst_argb), // %1
michael@0 4343 "+r"(width), // %2
michael@0 4344 "+r"(alpha) // %3
michael@0 4345 : "r"(fixed_invtbl8) // %4
michael@0 4346 : "memory", "cc"
michael@0 4347 #if defined(__native_client__) && defined(__x86_64__)
michael@0 4348 , "r14"
michael@0 4349 #endif
michael@0 4350 #if defined(__SSE2__)
michael@0 4351 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 4352 #endif
michael@0 4353 );
michael@0 4354 }
michael@0 4355 #endif // HAS_ARGBUNATTENUATEROW_SSE2
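
// A scalar sketch of the unattenuate above (illustrative only). The SIMD
// loop multiplies by fixed_invtbl8[a], a fixed-point reciprocal of alpha;
// dividing directly and clamping expresses the same idea without the table.
static void ARGBUnattenuateRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 a = src_argb[3];
    uint32 b = src_argb[0];
    uint32 g = src_argb[1];
    uint32 r = src_argb[2];
    if (a) {
      b = b * 255u / a;
      g = g * 255u / a;
      r = r * 255u / a;
    }
    dst_argb[0] = (uint8)(b > 255 ? 255 : b);
    dst_argb[1] = (uint8)(g > 255 ? 255 : g);
    dst_argb[2] = (uint8)(r > 255 ? 255 : r);
    dst_argb[3] = (uint8)a;
    src_argb += 4;
    dst_argb += 4;
  }
}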
michael@0 4356
michael@0 4357 #ifdef HAS_ARGBGRAYROW_SSSE3
michael@0 4358 // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
michael@0 4359 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
michael@0 4360 asm volatile (
michael@0 4361 "movdqa %3,%%xmm4 \n"
michael@0 4362 "movdqa %4,%%xmm5 \n"
michael@0 4363
michael@0 4364 // 8 pixel loop.
michael@0 4365 LABELALIGN
michael@0 4366 "1: \n"
michael@0 4367 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4368 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 4369 "pmaddubsw %%xmm4,%%xmm0 \n"
michael@0 4370 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 4371 "phaddw %%xmm1,%%xmm0 \n"
michael@0 4372 "paddw %%xmm5,%%xmm0 \n"
michael@0 4373 "psrlw $0x7,%%xmm0 \n"
michael@0 4374 "packuswb %%xmm0,%%xmm0 \n"
michael@0 4375 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
michael@0 4376 "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n"
michael@0 4377 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 4378 "psrld $0x18,%%xmm2 \n"
michael@0 4379 "psrld $0x18,%%xmm3 \n"
michael@0 4380 "packuswb %%xmm3,%%xmm2 \n"
michael@0 4381 "packuswb %%xmm2,%%xmm2 \n"
michael@0 4382 "movdqa %%xmm0,%%xmm3 \n"
michael@0 4383 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 4384 "punpcklbw %%xmm2,%%xmm3 \n"
michael@0 4385 "movdqa %%xmm0,%%xmm1 \n"
michael@0 4386 "punpcklwd %%xmm3,%%xmm0 \n"
michael@0 4387 "punpckhwd %%xmm3,%%xmm1 \n"
michael@0 4388 "sub $0x8,%2 \n"
michael@0 4389 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 4390 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
michael@0 4391 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 4392 "jg 1b \n"
michael@0 4393 : "+r"(src_argb), // %0
michael@0 4394 "+r"(dst_argb), // %1
michael@0 4395 "+r"(width) // %2
michael@0 4396 : "m"(kARGBToYJ), // %3
michael@0 4397 "m"(kAddYJ64) // %4
michael@0 4398 : "memory", "cc"
michael@0 4399 #if defined(__SSE2__)
michael@0 4400 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 4401 #endif
michael@0 4402 );
michael@0 4403 }
michael@0 4404 #endif // HAS_ARGBGRAYROW_SSSE3
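
// A scalar sketch of the gray conversion above (illustrative only): the
// kARGBToYJ weights are applied in B,G,R memory order with the kAddYJ64
// rounding bias, then the luma is replicated into B, G and R.
static void ARGBGrayRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    uint32 y =
        (15 * src_argb[0] + 75 * src_argb[1] + 38 * src_argb[2] + 64) >> 7;
    dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)y;
    dst_argb[3] = src_argb[3];  // Alpha passes through.
    src_argb += 4;
    dst_argb += 4;
  }
}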
michael@0 4405
michael@0 4406 #ifdef HAS_ARGBSEPIAROW_SSSE3
michael@0 4407 // b = (r * 35 + g * 68 + b * 17) >> 7
michael@0 4408 // g = (r * 45 + g * 88 + b * 22) >> 7
michael@0 4409 // r = (r * 50 + g * 98 + b * 24) >> 7
michael@0 4410 // Constant for ARGB color to sepia tone
michael@0 4411 static vec8 kARGBToSepiaB = {
michael@0 4412 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
michael@0 4413 };
michael@0 4414
michael@0 4415 static vec8 kARGBToSepiaG = {
michael@0 4416 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
michael@0 4417 };
michael@0 4418
michael@0 4419 static vec8 kARGBToSepiaR = {
michael@0 4420 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
michael@0 4421 };
michael@0 4422
michael@0 4423 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
michael@0 4424 void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
michael@0 4425 asm volatile (
michael@0 4426 "movdqa %2,%%xmm2 \n"
michael@0 4427 "movdqa %3,%%xmm3 \n"
michael@0 4428 "movdqa %4,%%xmm4 \n"
michael@0 4429
michael@0 4430 // 8 pixel loop.
michael@0 4431 LABELALIGN
michael@0 4432 "1: \n"
michael@0 4433 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4434 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
michael@0 4435 "pmaddubsw %%xmm2,%%xmm0 \n"
michael@0 4436 "pmaddubsw %%xmm2,%%xmm6 \n"
michael@0 4437 "phaddw %%xmm6,%%xmm0 \n"
michael@0 4438 "psrlw $0x7,%%xmm0 \n"
michael@0 4439 "packuswb %%xmm0,%%xmm0 \n"
michael@0 4440 "movdqa " MEMACCESS(0) ",%%xmm5 \n"
michael@0 4441 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 4442 "pmaddubsw %%xmm3,%%xmm5 \n"
michael@0 4443 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 4444 "phaddw %%xmm1,%%xmm5 \n"
michael@0 4445 "psrlw $0x7,%%xmm5 \n"
michael@0 4446 "packuswb %%xmm5,%%xmm5 \n"
michael@0 4447 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 4448 "movdqa " MEMACCESS(0) ",%%xmm5 \n"
michael@0 4449 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 4450 "pmaddubsw %%xmm4,%%xmm5 \n"
michael@0 4451 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 4452 "phaddw %%xmm1,%%xmm5 \n"
michael@0 4453 "psrlw $0x7,%%xmm5 \n"
michael@0 4454 "packuswb %%xmm5,%%xmm5 \n"
michael@0 4455 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
michael@0 4456 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 4457 "psrld $0x18,%%xmm6 \n"
michael@0 4458 "psrld $0x18,%%xmm1 \n"
michael@0 4459 "packuswb %%xmm1,%%xmm6 \n"
michael@0 4460 "packuswb %%xmm6,%%xmm6 \n"
michael@0 4461 "punpcklbw %%xmm6,%%xmm5 \n"
michael@0 4462 "movdqa %%xmm0,%%xmm1 \n"
michael@0 4463 "punpcklwd %%xmm5,%%xmm0 \n"
michael@0 4464 "punpckhwd %%xmm5,%%xmm1 \n"
michael@0 4465 "sub $0x8,%1 \n"
michael@0 4466 "movdqa %%xmm0," MEMACCESS(0) " \n"
michael@0 4467 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
michael@0 4468 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 4469 "jg 1b \n"
michael@0 4470 : "+r"(dst_argb), // %0
michael@0 4471 "+r"(width) // %1
michael@0 4472 : "m"(kARGBToSepiaB), // %2
michael@0 4473 "m"(kARGBToSepiaG), // %3
michael@0 4474 "m"(kARGBToSepiaR) // %4
michael@0 4475 : "memory", "cc"
michael@0 4476 #if defined(__SSE2__)
michael@0 4477 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 4478 #endif
michael@0 4479 );
michael@0 4480 }
michael@0 4481 #endif // HAS_ARGBSEPIAROW_SSSE3
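
// A scalar sketch of the sepia loop above (illustrative only), using the
// formulas from the comment before the constants. packuswb saturates in the
// SIMD path, hence the explicit clamps here; alpha is untouched.
static void ARGBSepiaRow_Sketch(uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int b = dst_argb[0];
    int g = dst_argb[1];
    int r = dst_argb[2];
    int sb = (r * 35 + g * 68 + b * 17) >> 7;
    int sg = (r * 45 + g * 88 + b * 22) >> 7;
    int sr = (r * 50 + g * 98 + b * 24) >> 7;
    dst_argb[0] = (uint8)(sb > 255 ? 255 : sb);
    dst_argb[1] = (uint8)(sg > 255 ? 255 : sg);
    dst_argb[2] = (uint8)(sr > 255 ? 255 : sr);
    dst_argb += 4;
  }
}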
michael@0 4482
michael@0 4483 #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
michael@0 4484 // Transform 8 ARGB pixels (32 bytes) with color matrix.
michael@0 4485 // Same as Sepia except matrix is provided.
michael@0 4486 void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 4487 const int8* matrix_argb, int width) {
michael@0 4488 asm volatile (
michael@0 4489 "movdqu " MEMACCESS(3) ",%%xmm5 \n"
michael@0 4490 "pshufd $0x00,%%xmm5,%%xmm2 \n"
michael@0 4491 "pshufd $0x55,%%xmm5,%%xmm3 \n"
michael@0 4492 "pshufd $0xaa,%%xmm5,%%xmm4 \n"
michael@0 4493 "pshufd $0xff,%%xmm5,%%xmm5 \n"
michael@0 4494
michael@0 4495 // 8 pixel loop.
michael@0 4496 LABELALIGN
michael@0 4497 "1: \n"
michael@0 4498 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4499 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
michael@0 4500 "pmaddubsw %%xmm2,%%xmm0 \n"
michael@0 4501 "pmaddubsw %%xmm2,%%xmm7 \n"
michael@0 4502 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
michael@0 4503 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 4504 "pmaddubsw %%xmm3,%%xmm6 \n"
michael@0 4505 "pmaddubsw %%xmm3,%%xmm1 \n"
michael@0 4506 "phaddsw %%xmm7,%%xmm0 \n"
michael@0 4507 "phaddsw %%xmm1,%%xmm6 \n"
michael@0 4508 "psraw $0x6,%%xmm0 \n"
michael@0 4509 "psraw $0x6,%%xmm6 \n"
michael@0 4510 "packuswb %%xmm0,%%xmm0 \n"
michael@0 4511 "packuswb %%xmm6,%%xmm6 \n"
michael@0 4512 "punpcklbw %%xmm6,%%xmm0 \n"
michael@0 4513 "movdqa " MEMACCESS(0) ",%%xmm1 \n"
michael@0 4514 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
michael@0 4515 "pmaddubsw %%xmm4,%%xmm1 \n"
michael@0 4516 "pmaddubsw %%xmm4,%%xmm7 \n"
michael@0 4517 "phaddsw %%xmm7,%%xmm1 \n"
michael@0 4518 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
michael@0 4519 "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n"
michael@0 4520 "pmaddubsw %%xmm5,%%xmm6 \n"
michael@0 4521 "pmaddubsw %%xmm5,%%xmm7 \n"
michael@0 4522 "phaddsw %%xmm7,%%xmm6 \n"
michael@0 4523 "psraw $0x6,%%xmm1 \n"
michael@0 4524 "psraw $0x6,%%xmm6 \n"
michael@0 4525 "packuswb %%xmm1,%%xmm1 \n"
michael@0 4526 "packuswb %%xmm6,%%xmm6 \n"
michael@0 4527 "punpcklbw %%xmm6,%%xmm1 \n"
michael@0 4528 "movdqa %%xmm0,%%xmm6 \n"
michael@0 4529 "punpcklwd %%xmm1,%%xmm0 \n"
michael@0 4530 "punpckhwd %%xmm1,%%xmm6 \n"
michael@0 4531 "sub $0x8,%2 \n"
michael@0 4532 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 4533 "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n"
michael@0 4534 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 4535 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 4536 "jg 1b \n"
michael@0 4537 : "+r"(src_argb), // %0
michael@0 4538 "+r"(dst_argb), // %1
michael@0 4539 "+r"(width) // %2
michael@0 4540 : "r"(matrix_argb) // %3
michael@0 4541 : "memory", "cc"
michael@0 4542 #if defined(__SSE2__)
michael@0 4543 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 4544 #endif
michael@0 4545 );
michael@0 4546 }
michael@0 4547 #endif // HAS_ARGBCOLORMATRIXROW_SSSE3
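
// A scalar sketch of the color matrix above (names illustrative):
// matrix_argb holds four signed taps per output channel. pmaddubsw and
// phaddsw accumulate the taps and psraw $0x6 supplies the >> 6, with the
// result saturated to a byte.
static int ClampTo255_Sketch(int v) {
  return v < 0 ? 0 : (v > 255 ? 255 : v);
}

static void ARGBColorMatrixRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                      const int8* m, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int b = src_argb[0], g = src_argb[1], r = src_argb[2], a = src_argb[3];
    dst_argb[0] = (uint8)ClampTo255_Sketch(
        (b * m[0] + g * m[1] + r * m[2] + a * m[3]) >> 6);
    dst_argb[1] = (uint8)ClampTo255_Sketch(
        (b * m[4] + g * m[5] + r * m[6] + a * m[7]) >> 6);
    dst_argb[2] = (uint8)ClampTo255_Sketch(
        (b * m[8] + g * m[9] + r * m[10] + a * m[11]) >> 6);
    dst_argb[3] = (uint8)ClampTo255_Sketch(
        (b * m[12] + g * m[13] + r * m[14] + a * m[15]) >> 6);
    src_argb += 4;
    dst_argb += 4;
  }
}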
michael@0 4548
michael@0 4549 #ifdef HAS_ARGBQUANTIZEROW_SSE2
michael@0 4550 // Quantize 4 ARGB pixels (16 bytes).
michael@0 4551 // aligned to 16 bytes
michael@0 4552 void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
michael@0 4553 int interval_offset, int width) {
michael@0 4554 asm volatile (
michael@0 4555 "movd %2,%%xmm2 \n"
michael@0 4556 "movd %3,%%xmm3 \n"
michael@0 4557 "movd %4,%%xmm4 \n"
michael@0 4558 "pshuflw $0x40,%%xmm2,%%xmm2 \n"
michael@0 4559 "pshufd $0x44,%%xmm2,%%xmm2 \n"
michael@0 4560 "pshuflw $0x40,%%xmm3,%%xmm3 \n"
michael@0 4561 "pshufd $0x44,%%xmm3,%%xmm3 \n"
michael@0 4562 "pshuflw $0x40,%%xmm4,%%xmm4 \n"
michael@0 4563 "pshufd $0x44,%%xmm4,%%xmm4 \n"
michael@0 4564 "pxor %%xmm5,%%xmm5 \n"
michael@0 4565 "pcmpeqb %%xmm6,%%xmm6 \n"
michael@0 4566 "pslld $0x18,%%xmm6 \n"
michael@0 4567
michael@0 4568 // 4 pixel loop.
michael@0 4569 LABELALIGN
michael@0 4570 "1: \n"
michael@0 4571 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4572 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 4573 "pmulhuw %%xmm2,%%xmm0 \n"
michael@0 4574 "movdqa " MEMACCESS(0) ",%%xmm1 \n"
michael@0 4575 "punpckhbw %%xmm5,%%xmm1 \n"
michael@0 4576 "pmulhuw %%xmm2,%%xmm1 \n"
michael@0 4577 "pmullw %%xmm3,%%xmm0 \n"
michael@0 4578 "movdqa " MEMACCESS(0) ",%%xmm7 \n"
michael@0 4579 "pmullw %%xmm3,%%xmm1 \n"
michael@0 4580 "pand %%xmm6,%%xmm7 \n"
michael@0 4581 "paddw %%xmm4,%%xmm0 \n"
michael@0 4582 "paddw %%xmm4,%%xmm1 \n"
michael@0 4583 "packuswb %%xmm1,%%xmm0 \n"
michael@0 4584 "por %%xmm7,%%xmm0 \n"
michael@0 4585 "sub $0x4,%1 \n"
michael@0 4586 "movdqa %%xmm0," MEMACCESS(0) " \n"
michael@0 4587 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4588 "jg 1b \n"
michael@0 4589 : "+r"(dst_argb), // %0
michael@0 4590 "+r"(width) // %1
michael@0 4591 : "r"(scale), // %2
michael@0 4592 "r"(interval_size), // %3
michael@0 4593 "r"(interval_offset) // %4
michael@0 4594 : "memory", "cc"
michael@0 4595 #if defined(__SSE2__)
michael@0 4596 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 4597 #endif
michael@0 4598 );
michael@0 4599 }
michael@0 4600 #endif // HAS_ARGBQUANTIZEROW_SSE2
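
// A scalar sketch of the quantize above (illustrative; exact alpha handling
// in the asm differs slightly): each color channel is snapped to its
// interval via a 16.16 fixed-point scale (pmulhuw), re-expanded by
// interval_size (pmullw) and offset (paddw); alpha is carried through by
// the 0xff000000 mask.
static void ARGBQuantizeRow_Sketch(uint8* dst_argb, int scale,
                                   int interval_size, int interval_offset,
                                   int width) {
  int i;
  for (i = 0; i < width; ++i) {
    dst_argb[0] = (uint8)((dst_argb[0] * scale >> 16) * interval_size +
                          interval_offset);
    dst_argb[1] = (uint8)((dst_argb[1] * scale >> 16) * interval_size +
                          interval_offset);
    dst_argb[2] = (uint8)((dst_argb[2] * scale >> 16) * interval_size +
                          interval_offset);
    dst_argb += 4;  // Alpha (byte 3) is left unchanged.
  }
}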
michael@0 4601
michael@0 4602 #ifdef HAS_ARGBSHADEROW_SSE2
michael@0 4603 // Shade 4 pixels at a time by specified value.
michael@0 4604 // Aligned to 16 bytes.
michael@0 4605 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
michael@0 4606 uint32 value) {
michael@0 4607 asm volatile (
michael@0 4608 "movd %3,%%xmm2 \n"
michael@0 4609 "punpcklbw %%xmm2,%%xmm2 \n"
michael@0 4610 "punpcklqdq %%xmm2,%%xmm2 \n"
michael@0 4611
michael@0 4612 // 4 pixel loop.
michael@0 4613 LABELALIGN
michael@0 4614 "1: \n"
michael@0 4615 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4616 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4617 "movdqa %%xmm0,%%xmm1 \n"
michael@0 4618 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 4619 "punpckhbw %%xmm1,%%xmm1 \n"
michael@0 4620 "pmulhuw %%xmm2,%%xmm0 \n"
michael@0 4621 "pmulhuw %%xmm2,%%xmm1 \n"
michael@0 4622 "psrlw $0x8,%%xmm0 \n"
michael@0 4623 "psrlw $0x8,%%xmm1 \n"
michael@0 4624 "packuswb %%xmm1,%%xmm0 \n"
michael@0 4625 "sub $0x4,%2 \n"
michael@0 4626 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 4627 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4628 "jg 1b \n"
michael@0 4629 : "+r"(src_argb), // %0
michael@0 4630 "+r"(dst_argb), // %1
michael@0 4631 "+r"(width) // %2
michael@0 4632 : "r"(value) // %3
michael@0 4633 : "memory", "cc"
michael@0 4634 #if defined(__SSE2__)
michael@0 4635 , "xmm0", "xmm1", "xmm2"
michael@0 4636 #endif
michael@0 4637 );
michael@0 4638 }
michael@0 4639 #endif // HAS_ARGBSHADEROW_SSE2
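
// A scalar sketch of the shade math above (illustrative only): 'value' is
// an ARGB color whose bytes scale the matching channels. Duplicating a byte
// v into a word gives v * 0x101, so pmulhuw followed by psrlw $0x8 computes
// (c * 0x101 * v * 0x101) >> 24, i.e. approximately c * v / 255.
static void ARGBShadeRow_Sketch(const uint8* src_argb, uint8* dst_argb,
                                int width, uint32 value) {
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      uint32 v = (value >> (j * 8)) & 0xff;
      uint32 c = src_argb[j];
      dst_argb[j] = (uint8)((c * 0x101u * (v * 0x101u)) >> 24);
    }
    src_argb += 4;
    dst_argb += 4;
  }
}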
michael@0 4640
michael@0 4641 #ifdef HAS_ARGBMULTIPLYROW_SSE2
michael@0 4642 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
michael@0 4643 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
michael@0 4644 uint8* dst_argb, int width) {
michael@0 4645 asm volatile (
michael@0 4646 "pxor %%xmm5,%%xmm5 \n"
michael@0 4647
michael@0 4648 // 4 pixel loop.
michael@0 4649 LABELALIGN
michael@0 4650 "1: \n"
michael@0 4651 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4652 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4653 "movdqu " MEMACCESS(1) ",%%xmm2 \n"
michael@0 4654 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4655 "movdqu %%xmm0,%%xmm1 \n"
michael@0 4656 "movdqu %%xmm2,%%xmm3 \n"
michael@0 4657 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 4658 "punpckhbw %%xmm1,%%xmm1 \n"
michael@0 4659 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 4660 "punpckhbw %%xmm5,%%xmm3 \n"
michael@0 4661 "pmulhuw %%xmm2,%%xmm0 \n"
michael@0 4662 "pmulhuw %%xmm3,%%xmm1 \n"
michael@0 4663 "packuswb %%xmm1,%%xmm0 \n"
michael@0 4664 "sub $0x4,%3 \n"
michael@0 4665 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 4666 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 4667 "jg 1b \n"
michael@0 4668 : "+r"(src_argb0), // %0
michael@0 4669 "+r"(src_argb1), // %1
michael@0 4670 "+r"(dst_argb), // %2
michael@0 4671 "+r"(width) // %3
michael@0 4672 :
michael@0 4673 : "memory", "cc"
michael@0 4674 #if defined(__SSE2__)
michael@0 4675 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 4676 #endif
michael@0 4677 );
michael@0 4678 }
michael@0 4679 #endif // HAS_ARGBMULTIPLYROW_SSE2
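
// A scalar sketch of the multiply above (illustrative only): the SIMD loop
// widens one source with itself (s0 * 0x101) and the other with zero (s1),
// so pmulhuw yields (s0 * 0x101 * s1) >> 16, roughly s0 * s1 / 255 per byte.
static void ARGBMultiplyRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1,
                                   uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < 4 * width; ++i) {
    dst_argb[i] = (uint8)((src_argb0[i] * 0x101u * src_argb1[i]) >> 16);
  }
}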
michael@0 4680
michael@0 4681 #ifdef HAS_ARGBADDROW_SSE2
michael@0 4682 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
michael@0 4683 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
michael@0 4684 uint8* dst_argb, int width) {
michael@0 4685 asm volatile (
michael@0 4686 // 4 pixel loop.
michael@0 4687 LABELALIGN
michael@0 4688 "1: \n"
michael@0 4689 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4690 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4691 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4692 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4693 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4694 "sub $0x4,%3 \n"
michael@0 4695 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 4696 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 4697 "jg 1b \n"
michael@0 4698 : "+r"(src_argb0), // %0
michael@0 4699 "+r"(src_argb1), // %1
michael@0 4700 "+r"(dst_argb), // %2
michael@0 4701 "+r"(width) // %3
michael@0 4702 :
michael@0 4703 : "memory", "cc"
michael@0 4704 #if defined(__SSE2__)
michael@0 4705 , "xmm0", "xmm1"
michael@0 4706 #endif
michael@0 4707 );
michael@0 4708 }
michael@0 4709 #endif // HAS_ARGBADDROW_SSE2
michael@0 4710
michael@0 4711 #ifdef HAS_ARGBSUBTRACTROW_SSE2
michael@0 4712 // Subtract 2 rows of ARGB pixels, 4 pixels at a time.
michael@0 4713 void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
michael@0 4714 uint8* dst_argb, int width) {
michael@0 4715 asm volatile (
michael@0 4716 // 4 pixel loop.
michael@0 4717 LABELALIGN
michael@0 4718 "1: \n"
michael@0 4719 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4720 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4721 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
michael@0 4722 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 4723 "psubusb %%xmm1,%%xmm0 \n"
michael@0 4724 "sub $0x4,%3 \n"
michael@0 4725 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 4726 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 4727 "jg 1b \n"
michael@0 4728 : "+r"(src_argb0), // %0
michael@0 4729 "+r"(src_argb1), // %1
michael@0 4730 "+r"(dst_argb), // %2
michael@0 4731 "+r"(width) // %3
michael@0 4732 :
michael@0 4733 : "memory", "cc"
michael@0 4734 #if defined(__SSE2__)
michael@0 4735 , "xmm0", "xmm1"
michael@0 4736 #endif
michael@0 4737 );
michael@0 4738 }
michael@0 4739 #endif // HAS_ARGBSUBTRACTROW_SSE2
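
// Scalar sketches for the two rows above (illustrative only): paddusb and
// psubusb are per-byte saturating add and subtract, applied to all four
// channels alike.
static void ARGBAddRow_Sketch(const uint8* src_argb0, const uint8* src_argb1,
                              uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < 4 * width; ++i) {
    int s = src_argb0[i] + src_argb1[i];
    dst_argb[i] = (uint8)(s > 255 ? 255 : s);
  }
}

static void ARGBSubtractRow_Sketch(const uint8* src_argb0,
                                   const uint8* src_argb1,
                                   uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < 4 * width; ++i) {
    int s = src_argb0[i] - src_argb1[i];
    dst_argb[i] = (uint8)(s < 0 ? 0 : s);
  }
}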
michael@0 4740
michael@0 4741 #ifdef HAS_SOBELXROW_SSE2
michael@0 4742 // SobelX as a matrix is
michael@0 4743 // -1 0 1
michael@0 4744 // -2 0 2
michael@0 4745 // -1 0 1
michael@0 4746 void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
michael@0 4747 const uint8* src_y2, uint8* dst_sobelx, int width) {
michael@0 4748 asm volatile (
michael@0 4749 "sub %0,%1 \n"
michael@0 4750 "sub %0,%2 \n"
michael@0 4751 "sub %0,%3 \n"
michael@0 4752 "pxor %%xmm5,%%xmm5 \n"
michael@0 4753
michael@0 4754 // 8 pixel loop.
michael@0 4755 LABELALIGN
michael@0 4756 "1: \n"
michael@0 4757 "movq " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4758 "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n"
michael@0 4759 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 4760 "punpcklbw %%xmm5,%%xmm1 \n"
michael@0 4761 "psubw %%xmm1,%%xmm0 \n"
michael@0 4762 BUNDLEALIGN
michael@0 4763 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
michael@0 4764 MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2
michael@0 4765 "punpcklbw %%xmm5,%%xmm1 \n"
michael@0 4766 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 4767 "psubw %%xmm2,%%xmm1 \n"
michael@0 4768 BUNDLEALIGN
michael@0 4769 MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2
michael@0 4770 MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3
michael@0 4771 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 4772 "punpcklbw %%xmm5,%%xmm3 \n"
michael@0 4773 "psubw %%xmm3,%%xmm2 \n"
michael@0 4774 "paddw %%xmm2,%%xmm0 \n"
michael@0 4775 "paddw %%xmm1,%%xmm0 \n"
michael@0 4776 "paddw %%xmm1,%%xmm0 \n"
michael@0 4777 "pxor %%xmm1,%%xmm1 \n"
michael@0 4778 "psubw %%xmm0,%%xmm1 \n"
michael@0 4779 "pmaxsw %%xmm1,%%xmm0 \n"
michael@0 4780 "packuswb %%xmm0,%%xmm0 \n"
michael@0 4781 "sub $0x8,%4 \n"
michael@0 4782 BUNDLEALIGN
michael@0 4783 MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1)
michael@0 4784 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 4785 "jg 1b \n"
michael@0 4786 : "+r"(src_y0), // %0
michael@0 4787 "+r"(src_y1), // %1
michael@0 4788 "+r"(src_y2), // %2
michael@0 4789 "+r"(dst_sobelx), // %3
michael@0 4790 "+r"(width) // %4
michael@0 4791 :
michael@0 4792 : "memory", "cc"
michael@0 4793 #if defined(__native_client__) && defined(__x86_64__)
michael@0 4794 , "r14"
michael@0 4795 #endif
michael@0 4796 #if defined(__SSE2__)
michael@0 4797 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 4798 #endif
michael@0 4799 );
michael@0 4800 }
michael@0 4801 #endif // HAS_SOBELXROW_SSE2
michael@0 4802
michael@0 4803 #ifdef HAS_SOBELYROW_SSE2
michael@0 4804 // SobelY as a matrix is
michael@0 4805 // -1 -2 -1
michael@0 4806 // 0 0 0
michael@0 4807 // 1 2 1
michael@0 4808 void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
michael@0 4809 uint8* dst_sobely, int width) {
michael@0 4810 asm volatile (
michael@0 4811 "sub %0,%1 \n"
michael@0 4812 "sub %0,%2 \n"
michael@0 4813 "pxor %%xmm5,%%xmm5 \n"
michael@0 4814
michael@0 4815 // 8 pixel loop.
michael@0 4816 LABELALIGN
michael@0 4817 "1: \n"
michael@0 4818 "movq " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4819 MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1
michael@0 4820 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 4821 "punpcklbw %%xmm5,%%xmm1 \n"
michael@0 4822 "psubw %%xmm1,%%xmm0 \n"
michael@0 4823 BUNDLEALIGN
michael@0 4824 "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n"
michael@0 4825 MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2
michael@0 4826 "punpcklbw %%xmm5,%%xmm1 \n"
michael@0 4827 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 4828 "psubw %%xmm2,%%xmm1 \n"
michael@0 4829 BUNDLEALIGN
michael@0 4830 "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n"
michael@0 4831 MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3
michael@0 4832 "punpcklbw %%xmm5,%%xmm2 \n"
michael@0 4833 "punpcklbw %%xmm5,%%xmm3 \n"
michael@0 4834 "psubw %%xmm3,%%xmm2 \n"
michael@0 4835 "paddw %%xmm2,%%xmm0 \n"
michael@0 4836 "paddw %%xmm1,%%xmm0 \n"
michael@0 4837 "paddw %%xmm1,%%xmm0 \n"
michael@0 4838 "pxor %%xmm1,%%xmm1 \n"
michael@0 4839 "psubw %%xmm0,%%xmm1 \n"
michael@0 4840 "pmaxsw %%xmm1,%%xmm0 \n"
michael@0 4841 "packuswb %%xmm0,%%xmm0 \n"
michael@0 4842 "sub $0x8,%3 \n"
michael@0 4843 BUNDLEALIGN
michael@0 4844 MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1)
michael@0 4845 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 4846 "jg 1b \n"
michael@0 4847 : "+r"(src_y0), // %0
michael@0 4848 "+r"(src_y1), // %1
michael@0 4849 "+r"(dst_sobely), // %2
michael@0 4850 "+r"(width) // %3
michael@0 4851 :
michael@0 4852 : "memory", "cc"
michael@0 4853 #if defined(__native_client__) && defined(__x86_64__)
michael@0 4854 , "r14"
michael@0 4855 #endif
michael@0 4856 #if defined(__SSE2__)
michael@0 4857 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 4858 #endif
michael@0 4859 );
michael@0 4860 }
michael@0 4861 #endif // HAS_SOBELYROW_SSE2
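
// Scalar sketches covering SobelXRow and SobelYRow above (illustrative
// only): a 3-tap gradient with the middle tap doubled, absolute value,
// saturated to a byte. SobelX differences columns two pixels apart across
// three rows; SobelY is the transposed stencil, differencing two rows at
// three horizontal offsets.
static void SobelXRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             const uint8* src_y2, uint8* dst_sobelx,
                             int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int s = a + b + b + c;
    if (s < 0) s = -s;
    dst_sobelx[i] = (uint8)(s > 255 ? 255 : s);
  }
}

static void SobelYRow_Sketch(const uint8* src_y0, const uint8* src_y1,
                             uint8* dst_sobely, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int a = src_y0[i + 0] - src_y1[i + 0];
    int b = src_y0[i + 1] - src_y1[i + 1];
    int c = src_y0[i + 2] - src_y1[i + 2];
    int s = a + b + b + c;
    if (s < 0) s = -s;
    dst_sobely[i] = (uint8)(s > 255 ? 255 : s);
  }
}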
michael@0 4862
michael@0 4863 #ifdef HAS_SOBELROW_SSE2
michael@0 4864 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
michael@0 4865 // A = 255
michael@0 4866 // R = Sobel
michael@0 4867 // G = Sobel
michael@0 4868 // B = Sobel
michael@0 4869 void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
michael@0 4870 uint8* dst_argb, int width) {
michael@0 4871 asm volatile (
michael@0 4872 "sub %0,%1 \n"
michael@0 4873 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 4874 "pslld $0x18,%%xmm5 \n"
michael@0 4875
michael@0 4876   // 16 pixel loop.
michael@0 4877 LABELALIGN
michael@0 4878 "1: \n"
michael@0 4879 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4880 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
michael@0 4881 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4882 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4883 "movdqa %%xmm0,%%xmm2 \n"
michael@0 4884 "punpcklbw %%xmm0,%%xmm2 \n"
michael@0 4885 "punpckhbw %%xmm0,%%xmm0 \n"
michael@0 4886 "movdqa %%xmm2,%%xmm1 \n"
michael@0 4887 "punpcklwd %%xmm2,%%xmm1 \n"
michael@0 4888 "punpckhwd %%xmm2,%%xmm2 \n"
michael@0 4889 "por %%xmm5,%%xmm1 \n"
michael@0 4890 "por %%xmm5,%%xmm2 \n"
michael@0 4891 "movdqa %%xmm0,%%xmm3 \n"
michael@0 4892 "punpcklwd %%xmm0,%%xmm3 \n"
michael@0 4893 "punpckhwd %%xmm0,%%xmm0 \n"
michael@0 4894 "por %%xmm5,%%xmm3 \n"
michael@0 4895 "por %%xmm5,%%xmm0 \n"
michael@0 4896 "sub $0x10,%3 \n"
michael@0 4897 "movdqa %%xmm1," MEMACCESS(2) " \n"
michael@0 4898 "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n"
michael@0 4899 "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n"
michael@0 4900 "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n"
michael@0 4901 "lea " MEMLEA(0x40,2) ",%2 \n"
michael@0 4902 "jg 1b \n"
michael@0 4903 : "+r"(src_sobelx), // %0
michael@0 4904 "+r"(src_sobely), // %1
michael@0 4905 "+r"(dst_argb), // %2
michael@0 4906 "+r"(width) // %3
michael@0 4907 :
michael@0 4908 : "memory", "cc"
michael@0 4909 #if defined(__native_client__) && defined(__x86_64__)
michael@0 4910 , "r14"
michael@0 4911 #endif
michael@0 4912 #if defined(__SSE2__)
michael@0 4913 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 4914 #endif
michael@0 4915 );
michael@0 4916 }
michael@0 4917 #endif // HAS_SOBELROW_SSE2
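
// A scalar sketch of the interleave above (illustrative only): the
// saturated sum of the two gradients is broadcast to B, G and R, with
// alpha forced to 255.
static void SobelRow_Sketch(const uint8* src_sobelx, const uint8* src_sobely,
                            uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;
    dst_argb[0] = dst_argb[1] = dst_argb[2] = (uint8)s;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}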
michael@0 4918
michael@0 4919 #ifdef HAS_SOBELTOPLANEROW_SSE2
michael@0 4920 // Adds Sobel X and Sobel Y and stores Sobel into a plane.
michael@0 4921 void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
michael@0 4922 uint8* dst_y, int width) {
michael@0 4923 asm volatile (
michael@0 4924 "sub %0,%1 \n"
michael@0 4925 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 4926 "pslld $0x18,%%xmm5 \n"
michael@0 4927
michael@0 4928   // 16 pixel loop.
michael@0 4929 LABELALIGN
michael@0 4930 "1: \n"
michael@0 4931 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4932 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
michael@0 4933 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4934 "paddusb %%xmm1,%%xmm0 \n"
michael@0 4935 "sub $0x10,%3 \n"
michael@0 4936 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 4937 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 4938 "jg 1b \n"
michael@0 4939 : "+r"(src_sobelx), // %0
michael@0 4940 "+r"(src_sobely), // %1
michael@0 4941 "+r"(dst_y), // %2
michael@0 4942 "+r"(width) // %3
michael@0 4943 :
michael@0 4944 : "memory", "cc"
michael@0 4945 #if defined(__native_client__) && defined(__x86_64__)
michael@0 4946 , "r14"
michael@0 4947 #endif
michael@0 4948 #if defined(__SSE2__)
michael@0 4949 , "xmm0", "xmm1"
michael@0 4950 #endif
michael@0 4951 );
michael@0 4952 }
michael@0 4953 #endif // HAS_SOBELTOPLANEROW_SSE2
michael@0 4954
michael@0 4955 #ifdef HAS_SOBELXYROW_SSE2
michael@0 4956 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
michael@0 4957 // A = 255
michael@0 4958 // R = Sobel X
michael@0 4959 // G = Sobel
michael@0 4960 // B = Sobel Y
michael@0 4961 void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
michael@0 4962 uint8* dst_argb, int width) {
michael@0 4963 asm volatile (
michael@0 4964 "sub %0,%1 \n"
michael@0 4965 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 4966
michael@0 4967   // 16 pixel loop.
michael@0 4968 LABELALIGN
michael@0 4969 "1: \n"
michael@0 4970 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 4971 MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1
michael@0 4972 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 4973 "movdqa %%xmm0,%%xmm2 \n"
michael@0 4974 "paddusb %%xmm1,%%xmm2 \n"
michael@0 4975 "movdqa %%xmm0,%%xmm3 \n"
michael@0 4976 "punpcklbw %%xmm5,%%xmm3 \n"
michael@0 4977 "punpckhbw %%xmm5,%%xmm0 \n"
michael@0 4978 "movdqa %%xmm1,%%xmm4 \n"
michael@0 4979 "punpcklbw %%xmm2,%%xmm4 \n"
michael@0 4980 "punpckhbw %%xmm2,%%xmm1 \n"
michael@0 4981 "movdqa %%xmm4,%%xmm6 \n"
michael@0 4982 "punpcklwd %%xmm3,%%xmm6 \n"
michael@0 4983 "punpckhwd %%xmm3,%%xmm4 \n"
michael@0 4984 "movdqa %%xmm1,%%xmm7 \n"
michael@0 4985 "punpcklwd %%xmm0,%%xmm7 \n"
michael@0 4986 "punpckhwd %%xmm0,%%xmm1 \n"
michael@0 4987 "sub $0x10,%3 \n"
michael@0 4988 "movdqa %%xmm6," MEMACCESS(2) " \n"
michael@0 4989 "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n"
michael@0 4990 "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n"
michael@0 4991 "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n"
michael@0 4992 "lea " MEMLEA(0x40,2) ",%2 \n"
michael@0 4993 "jg 1b \n"
michael@0 4994 : "+r"(src_sobelx), // %0
michael@0 4995 "+r"(src_sobely), // %1
michael@0 4996 "+r"(dst_argb), // %2
michael@0 4997 "+r"(width) // %3
michael@0 4998 :
michael@0 4999 : "memory", "cc"
michael@0 5000 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5001 , "r14"
michael@0 5002 #endif
michael@0 5003 #if defined(__SSE2__)
michael@0 5004 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 5005 #endif
michael@0 5006 );
michael@0 5007 }
michael@0 5008 #endif // HAS_SOBELXYROW_SSE2
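
// A scalar sketch of the SobelXY packing above (illustrative only),
// following the channel mapping in the comment: B = Sobel Y, G = the
// saturated combined Sobel, R = Sobel X, A = 255.
static void SobelXYRow_Sketch(const uint8* src_sobelx,
                              const uint8* src_sobely,
                              uint8* dst_argb, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    int s = src_sobelx[i] + src_sobely[i];
    if (s > 255) s = 255;
    dst_argb[0] = src_sobely[i];
    dst_argb[1] = (uint8)s;
    dst_argb[2] = src_sobelx[i];
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}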
michael@0 5009
michael@0 5010 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
michael@0 5011 // Creates a table of cumulative sums where each value is a sum of all values
michael@0 5012 // above and to the left of the value, inclusive of the value.
michael@0 5013 void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
michael@0 5014 const int32* previous_cumsum, int width) {
michael@0 5015 asm volatile (
michael@0 5016 "pxor %%xmm0,%%xmm0 \n"
michael@0 5017 "pxor %%xmm1,%%xmm1 \n"
michael@0 5018 "sub $0x4,%3 \n"
michael@0 5019 "jl 49f \n"
michael@0 5020 "test $0xf,%1 \n"
michael@0 5021 "jne 49f \n"
michael@0 5022
michael@0 5023   // 4 pixel loop.
michael@0 5024 LABELALIGN
michael@0 5025 "40: \n"
michael@0 5026 "movdqu " MEMACCESS(0) ",%%xmm2 \n"
michael@0 5027 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 5028 "movdqa %%xmm2,%%xmm4 \n"
michael@0 5029 "punpcklbw %%xmm1,%%xmm2 \n"
michael@0 5030 "movdqa %%xmm2,%%xmm3 \n"
michael@0 5031 "punpcklwd %%xmm1,%%xmm2 \n"
michael@0 5032 "punpckhwd %%xmm1,%%xmm3 \n"
michael@0 5033 "punpckhbw %%xmm1,%%xmm4 \n"
michael@0 5034 "movdqa %%xmm4,%%xmm5 \n"
michael@0 5035 "punpcklwd %%xmm1,%%xmm4 \n"
michael@0 5036 "punpckhwd %%xmm1,%%xmm5 \n"
michael@0 5037 "paddd %%xmm2,%%xmm0 \n"
michael@0 5038 "movdqa " MEMACCESS(2) ",%%xmm2 \n"
michael@0 5039 "paddd %%xmm0,%%xmm2 \n"
michael@0 5040 "paddd %%xmm3,%%xmm0 \n"
michael@0 5041 "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n"
michael@0 5042 "paddd %%xmm0,%%xmm3 \n"
michael@0 5043 "paddd %%xmm4,%%xmm0 \n"
michael@0 5044 "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n"
michael@0 5045 "paddd %%xmm0,%%xmm4 \n"
michael@0 5046 "paddd %%xmm5,%%xmm0 \n"
michael@0 5047 "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n"
michael@0 5048 "lea " MEMLEA(0x40,2) ",%2 \n"
michael@0 5049 "paddd %%xmm0,%%xmm5 \n"
michael@0 5050 "movdqa %%xmm2," MEMACCESS(1) " \n"
michael@0 5051 "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n"
michael@0 5052 "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n"
michael@0 5053 "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n"
michael@0 5054 "lea " MEMLEA(0x40,1) ",%1 \n"
michael@0 5055 "sub $0x4,%3 \n"
michael@0 5056 "jge 40b \n"
michael@0 5057
michael@0 5058 "49: \n"
michael@0 5059 "add $0x3,%3 \n"
michael@0 5060 "jl 19f \n"
michael@0 5061
michael@0 5062   // 1 pixel loop.
michael@0 5063 LABELALIGN
michael@0 5064 "10: \n"
michael@0 5065 "movd " MEMACCESS(0) ",%%xmm2 \n"
michael@0 5066 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 5067 "punpcklbw %%xmm1,%%xmm2 \n"
michael@0 5068 "punpcklwd %%xmm1,%%xmm2 \n"
michael@0 5069 "paddd %%xmm2,%%xmm0 \n"
michael@0 5070 "movdqu " MEMACCESS(2) ",%%xmm2 \n"
michael@0 5071 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 5072 "paddd %%xmm0,%%xmm2 \n"
michael@0 5073 "movdqu %%xmm2," MEMACCESS(1) " \n"
michael@0 5074 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5075 "sub $0x1,%3 \n"
michael@0 5076 "jge 10b \n"
michael@0 5077
michael@0 5078 "19: \n"
michael@0 5079 : "+r"(row), // %0
michael@0 5080 "+r"(cumsum), // %1
michael@0 5081 "+r"(previous_cumsum), // %2
michael@0 5082 "+r"(width) // %3
michael@0 5083 :
michael@0 5084 : "memory", "cc"
michael@0 5085 #if defined(__SSE2__)
michael@0 5086 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 5087 #endif
michael@0 5088 );
michael@0 5089 }
michael@0 5090 #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2
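
// A scalar sketch of the integral-image row above (illustrative only): a
// running per-channel sum of this row is added to the already finished row
// above, so each output holds the sum of everything up and to the left.
static void ComputeCumulativeSumRow_Sketch(const uint8* row, int32* cumsum,
                                           const int32* previous_cumsum,
                                           int width) {
  int32 sum[4] = {0, 0, 0, 0};
  int i, j;
  for (i = 0; i < width; ++i) {
    for (j = 0; j < 4; ++j) {
      sum[j] += row[i * 4 + j];
      cumsum[i * 4 + j] = sum[j] + previous_cumsum[i * 4 + j];
    }
  }
}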
michael@0 5091
michael@0 5092 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
michael@0 5093 void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
michael@0 5094 int width, int area, uint8* dst,
michael@0 5095 int count) {
michael@0 5096 asm volatile (
michael@0 5097 "movd %5,%%xmm5 \n"
michael@0 5098 "cvtdq2ps %%xmm5,%%xmm5 \n"
michael@0 5099 "rcpss %%xmm5,%%xmm4 \n"
michael@0 5100 "pshufd $0x0,%%xmm4,%%xmm4 \n"
michael@0 5101 "sub $0x4,%3 \n"
michael@0 5102 "jl 49f \n"
michael@0 5103 "cmpl $0x80,%5 \n"
michael@0 5104 "ja 40f \n"
michael@0 5105
michael@0 5106 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5107 "pcmpeqb %%xmm6,%%xmm6 \n"
michael@0 5108 "psrld $0x10,%%xmm6 \n"
michael@0 5109 "cvtdq2ps %%xmm6,%%xmm6 \n"
michael@0 5110 "addps %%xmm6,%%xmm5 \n"
michael@0 5111 "mulps %%xmm4,%%xmm5 \n"
michael@0 5112 "cvtps2dq %%xmm5,%%xmm5 \n"
michael@0 5113 "packssdw %%xmm5,%%xmm5 \n"
michael@0 5114
michael@0 5115   // 4 pixel small loop.
michael@0 5116 LABELALIGN
michael@0 5117 "4: \n"
michael@0 5118 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5119 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 5120 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 5121 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 5122 BUNDLEALIGN
michael@0 5123 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
michael@0 5124 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
michael@0 5125 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
michael@0 5126 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
michael@0 5127 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 5128 "psubd " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5129 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
michael@0 5130 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
michael@0 5131 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
michael@0 5132 BUNDLEALIGN
michael@0 5133 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
michael@0 5134 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
michael@0 5135 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
michael@0 5136 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
michael@0 5137 "lea " MEMLEA(0x40,1) ",%1 \n"
michael@0 5138 "packssdw %%xmm1,%%xmm0 \n"
michael@0 5139 "packssdw %%xmm3,%%xmm2 \n"
michael@0 5140 "pmulhuw %%xmm5,%%xmm0 \n"
michael@0 5141 "pmulhuw %%xmm5,%%xmm2 \n"
michael@0 5142 "packuswb %%xmm2,%%xmm0 \n"
michael@0 5143 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 5144 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 5145 "sub $0x4,%3 \n"
michael@0 5146 "jge 4b \n"
michael@0 5147 "jmp 49f \n"
michael@0 5148
michael@0 5149   // 4 pixel loop.
michael@0 5150 LABELALIGN
michael@0 5151 "40: \n"
michael@0 5152 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5153 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 5154 "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n"
michael@0 5155 "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n"
michael@0 5156 BUNDLEALIGN
michael@0 5157 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
michael@0 5158 MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1
michael@0 5159 MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2
michael@0 5160 MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3
michael@0 5161 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 5162 "psubd " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5163 "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n"
michael@0 5164 "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n"
michael@0 5165 "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n"
michael@0 5166 BUNDLEALIGN
michael@0 5167 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
michael@0 5168 MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1
michael@0 5169 MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2
michael@0 5170 MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3
michael@0 5171 "lea " MEMLEA(0x40,1) ",%1 \n"
michael@0 5172 "cvtdq2ps %%xmm0,%%xmm0 \n"
michael@0 5173 "cvtdq2ps %%xmm1,%%xmm1 \n"
michael@0 5174 "mulps %%xmm4,%%xmm0 \n"
michael@0 5175 "mulps %%xmm4,%%xmm1 \n"
michael@0 5176 "cvtdq2ps %%xmm2,%%xmm2 \n"
michael@0 5177 "cvtdq2ps %%xmm3,%%xmm3 \n"
michael@0 5178 "mulps %%xmm4,%%xmm2 \n"
michael@0 5179 "mulps %%xmm4,%%xmm3 \n"
michael@0 5180 "cvtps2dq %%xmm0,%%xmm0 \n"
michael@0 5181 "cvtps2dq %%xmm1,%%xmm1 \n"
michael@0 5182 "cvtps2dq %%xmm2,%%xmm2 \n"
michael@0 5183 "cvtps2dq %%xmm3,%%xmm3 \n"
michael@0 5184 "packssdw %%xmm1,%%xmm0 \n"
michael@0 5185 "packssdw %%xmm3,%%xmm2 \n"
michael@0 5186 "packuswb %%xmm2,%%xmm0 \n"
michael@0 5187 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 5188 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 5189 "sub $0x4,%3 \n"
michael@0 5190 "jge 40b \n"
michael@0 5191
michael@0 5192 "49: \n"
michael@0 5193 "add $0x3,%3 \n"
michael@0 5194 "jl 19f \n"
michael@0 5195
michael@0 5196   // 1 pixel loop.
michael@0 5197 LABELALIGN
michael@0 5198 "10: \n"
michael@0 5199 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5200 MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0
michael@0 5201 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 5202 "psubd " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5203 BUNDLEALIGN
michael@0 5204 MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0
michael@0 5205 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5206 "cvtdq2ps %%xmm0,%%xmm0 \n"
michael@0 5207 "mulps %%xmm4,%%xmm0 \n"
michael@0 5208 "cvtps2dq %%xmm0,%%xmm0 \n"
michael@0 5209 "packssdw %%xmm0,%%xmm0 \n"
michael@0 5210 "packuswb %%xmm0,%%xmm0 \n"
michael@0 5211 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 5212 "lea " MEMLEA(0x4,2) ",%2 \n"
michael@0 5213 "sub $0x1,%3 \n"
michael@0 5214 "jge 10b \n"
michael@0 5215 "19: \n"
michael@0 5216 : "+r"(topleft), // %0
michael@0 5217 "+r"(botleft), // %1
michael@0 5218 "+r"(dst), // %2
michael@0 5219 "+rm"(count) // %3
michael@0 5220 : "r"((intptr_t)(width)), // %4
michael@0 5221 "rm"(area) // %5
michael@0 5222 : "memory", "cc"
michael@0 5223 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5224 , "r14"
michael@0 5225 #endif
michael@0 5226 #if defined(__SSE2__)
michael@0 5227 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 5228 #endif
michael@0 5229 );
michael@0 5230 }
michael@0 5231 #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
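
// A scalar sketch of the box-filter average above (illustrative only; the
// asm folds the divide by 'area' into a reciprocal multiply). With an
// integral image, the sum over a rectangle is
// topleft - topright - botleft + botright; 'width' below is the int32
// element offset of the box's right edge, mirroring the (%0,%4,4)
// addressing in the asm.
static void CumulativeSumToAverageRow_Sketch(const int32* topleft,
                                             const int32* botleft,
                                             int width, int area, uint8* dst,
                                             int count) {
  int i, j;
  for (i = 0; i < count; ++i) {
    for (j = 0; j < 4; ++j) {
      int32 sum = topleft[j] - topleft[width + j] - botleft[j] +
                  botleft[width + j];
      dst[j] = (uint8)(sum / area);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}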
michael@0 5232
michael@0 5233 #ifdef HAS_ARGBAFFINEROW_SSE2
michael@0 5234 // Copy ARGB pixels from a source image, stepping by the slope (du, dv) per
michael@0 5234 // pixel, to a row of destination.
michael@0 5235 LIBYUV_API
michael@0 5236 void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
michael@0 5237 uint8* dst_argb, const float* src_dudv, int width) {
michael@0 5238 intptr_t src_argb_stride_temp = src_argb_stride;
michael@0 5239 intptr_t temp = 0;
michael@0 5240 asm volatile (
michael@0 5241 "movq " MEMACCESS(3) ",%%xmm2 \n"
michael@0 5242 "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n"
michael@0 5243 "shl $0x10,%1 \n"
michael@0 5244 "add $0x4,%1 \n"
michael@0 5245 "movd %1,%%xmm5 \n"
michael@0 5246 "sub $0x4,%4 \n"
michael@0 5247 "jl 49f \n"
michael@0 5248
michael@0 5249 "pshufd $0x44,%%xmm7,%%xmm7 \n"
michael@0 5250 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5251 "movdqa %%xmm2,%%xmm0 \n"
michael@0 5252 "addps %%xmm7,%%xmm0 \n"
michael@0 5253 "movlhps %%xmm0,%%xmm2 \n"
michael@0 5254 "movdqa %%xmm7,%%xmm4 \n"
michael@0 5255 "addps %%xmm4,%%xmm4 \n"
michael@0 5256 "movdqa %%xmm2,%%xmm3 \n"
michael@0 5257 "addps %%xmm4,%%xmm3 \n"
michael@0 5258 "addps %%xmm4,%%xmm4 \n"
michael@0 5259
michael@0 5260   // 4 pixel loop.
michael@0 5261 LABELALIGN
michael@0 5262 "40: \n"
michael@0 5263 "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2
michael@0 5264 "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2
michael@0 5265 "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts
michael@0 5266 "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride
michael@0 5267 "movd %%xmm0,%k1 \n"
michael@0 5268 "pshufd $0x39,%%xmm0,%%xmm0 \n"
michael@0 5269 "movd %%xmm0,%k5 \n"
michael@0 5270 "pshufd $0x39,%%xmm0,%%xmm0 \n"
michael@0 5271 BUNDLEALIGN
michael@0 5272 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
michael@0 5273 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
michael@0 5274 "punpckldq %%xmm6,%%xmm1 \n"
michael@0 5275 "addps %%xmm4,%%xmm2 \n"
michael@0 5276 "movq %%xmm1," MEMACCESS(2) " \n"
michael@0 5277 "movd %%xmm0,%k1 \n"
michael@0 5278 "pshufd $0x39,%%xmm0,%%xmm0 \n"
michael@0 5279 "movd %%xmm0,%k5 \n"
michael@0 5280 BUNDLEALIGN
michael@0 5281 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
michael@0 5282 MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6
michael@0 5283 "punpckldq %%xmm6,%%xmm0 \n"
michael@0 5284 "addps %%xmm4,%%xmm3 \n"
michael@0 5285 "sub $0x4,%4 \n"
michael@0 5286 "movq %%xmm0," MEMACCESS2(0x08,2) " \n"
michael@0 5287 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 5288 "jge 40b \n"
michael@0 5289
michael@0 5290 "49: \n"
michael@0 5291 "add $0x3,%4 \n"
michael@0 5292 "jl 19f \n"
michael@0 5293
michael@0 5294   // 1 pixel loop.
michael@0 5295 LABELALIGN
michael@0 5296 "10: \n"
michael@0 5297 "cvttps2dq %%xmm2,%%xmm0 \n"
michael@0 5298 "packssdw %%xmm0,%%xmm0 \n"
michael@0 5299 "pmaddwd %%xmm5,%%xmm0 \n"
michael@0 5300 "addps %%xmm7,%%xmm2 \n"
michael@0 5301 "movd %%xmm0,%k1 \n"
michael@0 5302 BUNDLEALIGN
michael@0 5303 MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0
michael@0 5304 "sub $0x1,%4 \n"
michael@0 5305 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 5306 "lea " MEMLEA(0x04,2) ",%2 \n"
michael@0 5307 "jge 10b \n"
michael@0 5308 "19: \n"
michael@0 5309 : "+r"(src_argb), // %0
michael@0 5310 "+r"(src_argb_stride_temp), // %1
michael@0 5311 "+r"(dst_argb), // %2
michael@0 5312 "+r"(src_dudv), // %3
michael@0 5313 "+rm"(width), // %4
michael@0 5314 "+r"(temp) // %5
michael@0 5315 :
michael@0 5316 : "memory", "cc"
michael@0 5317 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5318 , "r14"
michael@0 5319 #endif
michael@0 5320 #if defined(__SSE2__)
michael@0 5321 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 5322 #endif
michael@0 5323 );
michael@0 5324 }
michael@0 5325 #endif // HAS_ARGBAFFINEROW_SSE2
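
// A scalar sketch of the affine row copy above (illustrative only): (u, v)
// starts at src_dudv[0..1] and steps by src_dudv[2..3] per destination
// pixel; each step fetches one source ARGB pixel at the truncated
// coordinates, which is what cvttps2dq plus the x*4 + y*stride pmaddwd
// compute.
static void ARGBAffineRow_Sketch(const uint8* src_argb, int src_argb_stride,
                                 uint8* dst_argb, const float* src_dudv,
                                 int width) {
  int i;
  float u = src_dudv[0];
  float v = src_dudv[1];
  for (i = 0; i < width; ++i) {
    int x = (int)u;
    int y = (int)v;
    *(uint32*)dst_argb =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += src_dudv[2];
    v += src_dudv[3];
  }
}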
michael@0 5326
michael@0 5327 #ifdef HAS_INTERPOLATEROW_SSSE3
michael@0 5328 // Bilinear filter 16x2 -> 16x1
michael@0 5329 void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
michael@0 5330 ptrdiff_t src_stride, int dst_width,
michael@0 5331 int source_y_fraction) {
michael@0 5332 asm volatile (
michael@0 5333 "sub %1,%0 \n"
michael@0 5334 "shr %3 \n"
michael@0 5335 "cmp $0x0,%3 \n"
michael@0 5336 "je 100f \n"
michael@0 5337 "cmp $0x20,%3 \n"
michael@0 5338 "je 75f \n"
michael@0 5339 "cmp $0x40,%3 \n"
michael@0 5340 "je 50f \n"
michael@0 5341 "cmp $0x60,%3 \n"
michael@0 5342 "je 25f \n"
michael@0 5343
michael@0 5344 "movd %3,%%xmm0 \n"
michael@0 5345 "neg %3 \n"
michael@0 5346 "add $0x80,%3 \n"
michael@0 5347 "movd %3,%%xmm5 \n"
michael@0 5348 "punpcklbw %%xmm0,%%xmm5 \n"
michael@0 5349 "punpcklwd %%xmm5,%%xmm5 \n"
michael@0 5350 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5351
michael@0 5352 // General purpose row blend.
michael@0 5353 LABELALIGN
michael@0 5354 "1: \n"
michael@0 5355 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5356 MEMOPREG(movdqa,0x00,1,4,1,xmm2)
michael@0 5357 "movdqa %%xmm0,%%xmm1 \n"
michael@0 5358 "punpcklbw %%xmm2,%%xmm0 \n"
michael@0 5359 "punpckhbw %%xmm2,%%xmm1 \n"
michael@0 5360 "pmaddubsw %%xmm5,%%xmm0 \n"
michael@0 5361 "pmaddubsw %%xmm5,%%xmm1 \n"
michael@0 5362 "psrlw $0x7,%%xmm0 \n"
michael@0 5363 "psrlw $0x7,%%xmm1 \n"
michael@0 5364 "packuswb %%xmm1,%%xmm0 \n"
michael@0 5365 "sub $0x10,%2 \n"
michael@0 5366 BUNDLEALIGN
michael@0 5367 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
michael@0 5368 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5369 "jg 1b \n"
michael@0 5370 "jmp 99f \n"
michael@0 5371
michael@0 5372 // Blend 25 / 75.
michael@0 5373 LABELALIGN
michael@0 5374 "25: \n"
michael@0 5375 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5376 MEMOPREG(movdqa,0x00,1,4,1,xmm1)
michael@0 5377 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5378 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5379 "sub $0x10,%2 \n"
michael@0 5380 BUNDLEALIGN
michael@0 5381 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
michael@0 5382 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5383 "jg 25b \n"
michael@0 5384 "jmp 99f \n"
michael@0 5385
michael@0 5386 // Blend 50 / 50.
michael@0 5387 LABELALIGN
michael@0 5388 "50: \n"
michael@0 5389 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5390 MEMOPREG(movdqa,0x00,1,4,1,xmm1)
michael@0 5391 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5392 "sub $0x10,%2 \n"
michael@0 5393 BUNDLEALIGN
michael@0 5394 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
michael@0 5395 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5396 "jg 50b \n"
michael@0 5397 "jmp 99f \n"
michael@0 5398
michael@0 5399 // Blend 75 / 25.
michael@0 5400 LABELALIGN
michael@0 5401 "75: \n"
michael@0 5402 "movdqa " MEMACCESS(1) ",%%xmm1 \n"
michael@0 5403 MEMOPREG(movdqa,0x00,1,4,1,xmm0)
michael@0 5404 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5405 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5406 "sub $0x10,%2 \n"
michael@0 5407 BUNDLEALIGN
michael@0 5408 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
michael@0 5409 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5410 "jg 75b \n"
michael@0 5411 "jmp 99f \n"
michael@0 5412
michael@0 5413 // Blend 100 / 0 - Copy row unchanged.
michael@0 5414 LABELALIGN
michael@0 5415 "100: \n"
michael@0 5416 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5417 "sub $0x10,%2 \n"
michael@0 5418 MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
michael@0 5419 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5420 "jg 100b \n"
michael@0 5421
michael@0 5422 "99: \n"
michael@0 5423 : "+r"(dst_ptr), // %0
michael@0 5424 "+r"(src_ptr), // %1
michael@0 5425 "+r"(dst_width), // %2
michael@0 5426 "+r"(source_y_fraction) // %3
michael@0 5427 : "r"((intptr_t)(src_stride)) // %4
michael@0 5428 : "memory", "cc"
michael@0 5429 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5430 , "r14"
michael@0 5431 #endif
michael@0 5432 #if defined(__SSE2__)
michael@0 5433 , "xmm0", "xmm1", "xmm2", "xmm5"
michael@0 5434 #endif
michael@0 5435 );
michael@0 5436 }
michael@0 5437 #endif // HAS_INTERPOLATEROW_SSSE3
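
// A scalar sketch of the bilinear blend above (illustrative only; it also
// describes the SSE2 and _Unaligned_ variants below). source_y_fraction
// (0..256) is halved to 7 bits, and the 0/25/50/75 percent branches in the
// asm are just fast paths for the same formula.
static void InterpolateRow_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                  ptrdiff_t src_stride, int dst_width,
                                  int source_y_fraction) {
  int y1_fraction = source_y_fraction >> 1;  // 0..128
  int y0_fraction = 128 - y1_fraction;
  const uint8* src_ptr1 = src_ptr + src_stride;
  int i;
  for (i = 0; i < dst_width; ++i) {
    dst_ptr[i] =
        (uint8)((src_ptr[i] * y0_fraction + src_ptr1[i] * y1_fraction) >> 7);
  }
}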
michael@0 5438
michael@0 5439 #ifdef HAS_INTERPOLATEROW_SSE2
michael@0 5440 // Bilinear filter 16x2 -> 16x1
michael@0 5441 void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
michael@0 5442 ptrdiff_t src_stride, int dst_width,
michael@0 5443 int source_y_fraction) {
michael@0 5444 asm volatile (
michael@0 5445 "sub %1,%0 \n"
michael@0 5446 "shr %3 \n"
michael@0 5447 "cmp $0x0,%3 \n"
michael@0 5448 "je 100f \n"
michael@0 5449 "cmp $0x20,%3 \n"
michael@0 5450 "je 75f \n"
michael@0 5451 "cmp $0x40,%3 \n"
michael@0 5452 "je 50f \n"
michael@0 5453 "cmp $0x60,%3 \n"
michael@0 5454 "je 25f \n"
michael@0 5455
michael@0 5456 "movd %3,%%xmm0 \n"
michael@0 5457 "neg %3 \n"
michael@0 5458 "add $0x80,%3 \n"
michael@0 5459 "movd %3,%%xmm5 \n"
michael@0 5460 "punpcklbw %%xmm0,%%xmm5 \n"
michael@0 5461 "punpcklwd %%xmm5,%%xmm5 \n"
michael@0 5462 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5463 "pxor %%xmm4,%%xmm4 \n"
michael@0 5464
michael@0 5465 // General purpose row blend.
michael@0 5466 LABELALIGN
michael@0 5467 "1: \n"
michael@0 5468 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5469 MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2
michael@0 5470 "movdqa %%xmm0,%%xmm1 \n"
michael@0 5471 "movdqa %%xmm2,%%xmm3 \n"
michael@0 5472 "punpcklbw %%xmm4,%%xmm2 \n"
michael@0 5473 "punpckhbw %%xmm4,%%xmm3 \n"
michael@0 5474 "punpcklbw %%xmm4,%%xmm0 \n"
michael@0 5475 "punpckhbw %%xmm4,%%xmm1 \n"
michael@0 5476 "psubw %%xmm0,%%xmm2 \n"
michael@0 5477 "psubw %%xmm1,%%xmm3 \n"
michael@0 5478 "paddw %%xmm2,%%xmm2 \n"
michael@0 5479 "paddw %%xmm3,%%xmm3 \n"
michael@0 5480 "pmulhw %%xmm5,%%xmm2 \n"
michael@0 5481 "pmulhw %%xmm5,%%xmm3 \n"
michael@0 5482 "paddw %%xmm2,%%xmm0 \n"
michael@0 5483 "paddw %%xmm3,%%xmm1 \n"
michael@0 5484 "packuswb %%xmm1,%%xmm0 \n"
michael@0 5485 "sub $0x10,%2 \n"
michael@0 5486 BUNDLEALIGN
michael@0 5487 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
michael@0 5488 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5489 "jg 1b \n"
michael@0 5490 "jmp 99f \n"
michael@0 5491
michael@0 5492 // Blend 25 / 75.
michael@0 5493 LABELALIGN
michael@0 5494 "25: \n"
michael@0 5495 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5496 MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
michael@0 5497 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5498 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5499 "sub $0x10,%2 \n"
michael@0 5500 BUNDLEALIGN
michael@0 5501 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
michael@0 5502 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5503 "jg 25b \n"
michael@0 5504 "jmp 99f \n"
michael@0 5505
michael@0 5506 // Blend 50 / 50.
michael@0 5507 LABELALIGN
michael@0 5508 "50: \n"
michael@0 5509 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5510 MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1
michael@0 5511 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5512 "sub $0x10,%2 \n"
michael@0 5513 BUNDLEALIGN
michael@0 5514 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
michael@0 5515 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5516 "jg 50b \n"
michael@0 5517 "jmp 99f \n"
michael@0 5518
michael@0 5519 // Blend 75 / 25.
michael@0 5520 LABELALIGN
michael@0 5521 "75: \n"
michael@0 5522 "movdqa " MEMACCESS(1) ",%%xmm1 \n"
michael@0 5523 MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0
michael@0 5524 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5525 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5526 "sub $0x10,%2 \n"
michael@0 5527 BUNDLEALIGN
michael@0 5528 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
michael@0 5529 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5530 "jg 75b \n"
michael@0 5531 "jmp 99f \n"
michael@0 5532
michael@0 5533 // Blend 100 / 0 - Copy row unchanged.
michael@0 5534 LABELALIGN
michael@0 5535 "100: \n"
michael@0 5536 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5537 "sub $0x10,%2 \n"
michael@0 5538 MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1)
michael@0 5539 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5540 "jg 100b \n"
michael@0 5541
michael@0 5542 "99: \n"
michael@0 5543 : "+r"(dst_ptr), // %0
michael@0 5544 "+r"(src_ptr), // %1
michael@0 5545 "+r"(dst_width), // %2
michael@0 5546 "+r"(source_y_fraction) // %3
michael@0 5547 : "r"((intptr_t)(src_stride)) // %4
michael@0 5548 : "memory", "cc"
michael@0 5549 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5550 , "r14"
michael@0 5551 #endif
michael@0 5552 #if defined(__SSE2__)
michael@0 5553 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 5554 #endif
michael@0 5555 );
michael@0 5556 }
michael@0 5557 #endif // HAS_INTERPOLATEROW_SSE2
michael@0 5558
michael@0 5559 #ifdef HAS_INTERPOLATEROW_SSSE3
michael@0 5560 // Bilinear filter 16x2 -> 16x1
michael@0 5561 void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
michael@0 5562 ptrdiff_t src_stride, int dst_width,
michael@0 5563 int source_y_fraction) {
michael@0 5564 asm volatile (
michael@0 5565 "sub %1,%0 \n"
michael@0 5566 "shr %3 \n"
michael@0 5567 "cmp $0x0,%3 \n"
michael@0 5568 "je 100f \n"
michael@0 5569 "cmp $0x20,%3 \n"
michael@0 5570 "je 75f \n"
michael@0 5571 "cmp $0x40,%3 \n"
michael@0 5572 "je 50f \n"
michael@0 5573 "cmp $0x60,%3 \n"
michael@0 5574 "je 25f \n"
michael@0 5575
michael@0 5576 "movd %3,%%xmm0 \n"
michael@0 5577 "neg %3 \n"
michael@0 5578 "add $0x80,%3 \n"
michael@0 5579 "movd %3,%%xmm5 \n"
michael@0 5580 "punpcklbw %%xmm0,%%xmm5 \n"
michael@0 5581 "punpcklwd %%xmm5,%%xmm5 \n"
michael@0 5582 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5583
michael@0 5584 // General purpose row blend.
michael@0 5585 LABELALIGN
michael@0 5586 "1: \n"
michael@0 5587 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5588 MEMOPREG(movdqu,0x00,1,4,1,xmm2)
michael@0 5589 "movdqu %%xmm0,%%xmm1 \n"
michael@0 5590 "punpcklbw %%xmm2,%%xmm0 \n"
michael@0 5591 "punpckhbw %%xmm2,%%xmm1 \n"
michael@0 5592 "pmaddubsw %%xmm5,%%xmm0 \n"
michael@0 5593 "pmaddubsw %%xmm5,%%xmm1 \n"
michael@0 5594 "psrlw $0x7,%%xmm0 \n"
michael@0 5595 "psrlw $0x7,%%xmm1 \n"
michael@0 5596 "packuswb %%xmm1,%%xmm0 \n"
michael@0 5597 "sub $0x10,%2 \n"
michael@0 5598 BUNDLEALIGN
michael@0 5599 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
michael@0 5600 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5601 "jg 1b \n"
michael@0 5602 "jmp 99f \n"
michael@0 5603
michael@0 5604 // Blend 25 / 75.
michael@0 5605 LABELALIGN
michael@0 5606 "25: \n"
michael@0 5607 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5608 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
michael@0 5609 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5610 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5611 "sub $0x10,%2 \n"
michael@0 5612 BUNDLEALIGN
michael@0 5613 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
michael@0 5614 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5615 "jg 25b \n"
michael@0 5616 "jmp 99f \n"
michael@0 5617
michael@0 5618 // Blend 50 / 50.
michael@0 5619 LABELALIGN
michael@0 5620 "50: \n"
michael@0 5621 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5622 MEMOPREG(movdqu,0x00,1,4,1,xmm1)
michael@0 5623 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5624 "sub $0x10,%2 \n"
michael@0 5625 BUNDLEALIGN
michael@0 5626 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
michael@0 5627 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5628 "jg 50b \n"
michael@0 5629 "jmp 99f \n"
michael@0 5630
michael@0 5631 // Blend 75 / 25.
michael@0 5632 LABELALIGN
michael@0 5633 "75: \n"
michael@0 5634 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
michael@0 5635 MEMOPREG(movdqu,0x00,1,4,1,xmm0)
michael@0 5636 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5637 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5638 "sub $0x10,%2 \n"
michael@0 5639 BUNDLEALIGN
michael@0 5640 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
michael@0 5641 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5642 "jg 75b \n"
michael@0 5643 "jmp 99f \n"
michael@0 5644
michael@0 5645 // Blend 100 / 0 - Copy row unchanged.
michael@0 5646 LABELALIGN
michael@0 5647 "100: \n"
michael@0 5648 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5649 "sub $0x10,%2 \n"
michael@0 5650 MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
michael@0 5651 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5652 "jg 100b \n"
michael@0 5653
michael@0 5654 "99: \n"
michael@0 5655 : "+r"(dst_ptr), // %0
michael@0 5656 "+r"(src_ptr), // %1
michael@0 5657 "+r"(dst_width), // %2
michael@0 5658 "+r"(source_y_fraction) // %3
michael@0 5659 : "r"((intptr_t)(src_stride)) // %4
michael@0 5660 : "memory", "cc"
michael@0 5661 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5662 , "r14"
michael@0 5663 #endif
michael@0 5664 #if defined(__SSE2__)
michael@0 5665 , "xmm0", "xmm1", "xmm2", "xmm5"
michael@0 5666 #endif
michael@0 5667 );
michael@0 5668 }
michael@0 5669 #endif // HAS_INTERPOLATEROW_SSSE3
michael@0 5670
michael@0 5671 #ifdef HAS_INTERPOLATEROW_SSE2
michael@0 5672 // Bilinear filter 16x2 -> 16x1
michael@0 5673 void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
michael@0 5674 ptrdiff_t src_stride, int dst_width,
michael@0 5675 int source_y_fraction) {
michael@0 5676 asm volatile (
michael@0 5677 "sub %1,%0 \n"
michael@0 5678 "shr %3 \n"
michael@0 5679 "cmp $0x0,%3 \n"
michael@0 5680 "je 100f \n"
michael@0 5681 "cmp $0x20,%3 \n"
michael@0 5682 "je 75f \n"
michael@0 5683 "cmp $0x40,%3 \n"
michael@0 5684 "je 50f \n"
michael@0 5685 "cmp $0x60,%3 \n"
michael@0 5686 "je 25f \n"
michael@0 5687
michael@0 5688 "movd %3,%%xmm0 \n"
michael@0 5689 "neg %3 \n"
michael@0 5690 "add $0x80,%3 \n"
michael@0 5691 "movd %3,%%xmm5 \n"
michael@0 5692 "punpcklbw %%xmm0,%%xmm5 \n"
michael@0 5693 "punpcklwd %%xmm5,%%xmm5 \n"
michael@0 5694 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5695 "pxor %%xmm4,%%xmm4 \n"
michael@0 5696
michael@0 5697 // General purpose row blend.
michael@0 5698 LABELALIGN
michael@0 5699 "1: \n"
michael@0 5700 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5701 MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2
michael@0 5702 "movdqu %%xmm0,%%xmm1 \n"
michael@0 5703 "movdqu %%xmm2,%%xmm3 \n"
michael@0 5704 "punpcklbw %%xmm4,%%xmm2 \n"
michael@0 5705 "punpckhbw %%xmm4,%%xmm3 \n"
michael@0 5706 "punpcklbw %%xmm4,%%xmm0 \n"
michael@0 5707 "punpckhbw %%xmm4,%%xmm1 \n"
michael@0 5708 "psubw %%xmm0,%%xmm2 \n"
michael@0 5709 "psubw %%xmm1,%%xmm3 \n"
michael@0 5710 "paddw %%xmm2,%%xmm2 \n"
michael@0 5711 "paddw %%xmm3,%%xmm3 \n"
michael@0 5712 "pmulhw %%xmm5,%%xmm2 \n"
michael@0 5713 "pmulhw %%xmm5,%%xmm3 \n"
michael@0 5714 "paddw %%xmm2,%%xmm0 \n"
michael@0 5715 "paddw %%xmm3,%%xmm1 \n"
michael@0 5716 "packuswb %%xmm1,%%xmm0 \n"
michael@0 5717 "sub $0x10,%2 \n"
michael@0 5718 BUNDLEALIGN
michael@0 5719 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
michael@0 5720 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5721 "jg 1b \n"
michael@0 5722 "jmp 99f \n"
michael@0 5723
michael@0 5724 // Blend 25 / 75.
michael@0 5725 LABELALIGN
michael@0 5726 "25: \n"
michael@0 5727 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5728 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
michael@0 5729 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5730 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5731 "sub $0x10,%2 \n"
michael@0 5732 BUNDLEALIGN
michael@0 5733 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
michael@0 5734 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5735 "jg 25b \n"
michael@0 5736 "jmp 99f \n"
michael@0 5737
michael@0 5738 // Blend 50 / 50.
michael@0 5739 LABELALIGN
michael@0 5740 "50: \n"
michael@0 5741 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5742 MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1
michael@0 5743 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5744 "sub $0x10,%2 \n"
michael@0 5745 BUNDLEALIGN
michael@0 5746 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
michael@0 5747 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5748 "jg 50b \n"
michael@0 5749 "jmp 99f \n"
michael@0 5750
michael@0 5751 // Blend 75 / 25.
michael@0 5752 LABELALIGN
michael@0 5753 "75: \n"
michael@0 5754 "movdqu " MEMACCESS(1) ",%%xmm1 \n"
michael@0 5755 MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0
michael@0 5756 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5757 "pavgb %%xmm1,%%xmm0 \n"
michael@0 5758 "sub $0x10,%2 \n"
michael@0 5759 BUNDLEALIGN
michael@0 5760 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
michael@0 5761 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5762 "jg 75b \n"
michael@0 5763 "jmp 99f \n"
michael@0 5764
michael@0 5765 // Blend 100 / 0 - Copy row unchanged.
michael@0 5766 LABELALIGN
michael@0 5767 "100: \n"
michael@0 5768 "movdqu " MEMACCESS(1) ",%%xmm0 \n"
michael@0 5769 "sub $0x10,%2 \n"
michael@0 5770 MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1)
michael@0 5771 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 5772 "jg 100b \n"
michael@0 5773
michael@0 5774 "99: \n"
michael@0 5775 : "+r"(dst_ptr), // %0
michael@0 5776 "+r"(src_ptr), // %1
michael@0 5777 "+r"(dst_width), // %2
michael@0 5778 "+r"(source_y_fraction) // %3
michael@0 5779 : "r"((intptr_t)(src_stride)) // %4
michael@0 5780 : "memory", "cc"
michael@0 5781 #if defined(__native_client__) && defined(__x86_64__)
michael@0 5782 , "r14"
michael@0 5783 #endif
michael@0 5784 #if defined(__SSE2__)
michael@0 5785 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 5786 #endif
michael@0 5787 );
michael@0 5788 }
michael@0 5789 #endif // HAS_INTERPOLATEROW_SSE2
michael@0 5790
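// Editor's sketch, not part of the original source: a scalar reading of the
// two InterpolateRow variants above, for reference. It assumes the usual
// convention that source_y_fraction (0..256) is the weight of the second
// row; the asm halves it and recovers the blend with pmulhw tricks. The
// name InterpolateRowSketch_C is hypothetical. Note the 25/50/75 fast paths
// above round via pavgb, so they may differ from this by 1.
static void InterpolateRowSketch_C(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  int y1_fraction = source_y_fraction;  // Weight of row 1, out of 256.
  int y0_fraction = 256 - y1_fraction;  // Weight of row 0.
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0_fraction +
                          src_ptr1[x] * y1_fraction) >> 8);
  }
}
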
michael@0 5791 #ifdef HAS_HALFROW_SSE2
michael@0 5792 void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
michael@0 5793 uint8* dst_uv, int pix) {
michael@0 5794 asm volatile (
michael@0 5795 "sub %0,%1 \n"
michael@0 5796 LABELALIGN
michael@0 5797 "1: \n"
michael@0 5798 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5799 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0
michael@0 5800 "sub $0x10,%2 \n"
michael@0 5801 MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1)
michael@0 5802 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 5803 "jg 1b \n"
michael@0 5804 : "+r"(src_uv), // %0
michael@0 5805 "+r"(dst_uv), // %1
michael@0 5806 "+r"(pix) // %2
michael@0 5807 : "r"((intptr_t)(src_uv_stride)) // %3
michael@0 5808 : "memory", "cc"
michael@0 5809 #if defined(__SSE2__)
michael@0 5810 , "xmm0"
michael@0 5811 #endif
michael@0 5812 );
michael@0 5813 }
michael@0 5814 #endif // HAS_HALFROW_SSE2
michael@0 5815
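// Editor's sketch, not part of the original source: HalfRow_SSE2 above
// averages each byte with the byte one stride below using pavgb, which
// rounds up, i.e. (a + b + 1) >> 1. Scalar equivalent (hypothetical name):
static void HalfRowSketch_C(const uint8* src_uv, int src_uv_stride,
                            uint8* dst_uv, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1);
  }
}
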
michael@0 5816 #ifdef HAS_ARGBTOBAYERROW_SSSE3
michael@0 5817 void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
michael@0 5818 uint32 selector, int pix) {
michael@0 5819 asm volatile (
michael@0 5820 // NaCL caveat - assumes movd is from GPR
michael@0 5821 "movd %3,%%xmm5 \n"
michael@0 5822 "pshufd $0x0,%%xmm5,%%xmm5 \n"
michael@0 5823 LABELALIGN
michael@0 5824 "1: \n"
michael@0 5825 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5826 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 5827 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 5828 "pshufb %%xmm5,%%xmm0 \n"
michael@0 5829 "pshufb %%xmm5,%%xmm1 \n"
michael@0 5830 "punpckldq %%xmm1,%%xmm0 \n"
michael@0 5831 "sub $0x8,%2 \n"
michael@0 5832 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 5833 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 5834 "jg 1b \n"
michael@0 5835 : "+r"(src_argb), // %0
michael@0 5836 "+r"(dst_bayer), // %1
michael@0 5837 "+r"(pix) // %2
michael@0 5838 : "g"(selector) // %3
michael@0 5839 : "memory", "cc"
michael@0 5840 #if defined(__SSE2__)
michael@0 5841 , "xmm0", "xmm1", "xmm5"
michael@0 5842 #endif
michael@0 5843 );
michael@0 5844 }
michael@0 5845 #endif // HAS_ARGBTOBAYERROW_SSSE3
michael@0 5846
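// Editor's sketch, not part of the original source: the SSSE3 routine above
// broadcasts 'selector' and uses it as a pshufb mask, so each of its four
// bytes indexes one byte inside a 16-byte (four pixel) ARGB chunk. Scalar
// reading under that assumption, with pix taken as a multiple of the
// unroll (hypothetical name):
static void ARGBToBayerRowSketch_C(const uint8* src_argb, uint8* dst_bayer,
                                   uint32 selector, int pix) {
  for (int x = 0; x < pix; x += 4) {
    dst_bayer[0] = src_argb[selector & 0xff];
    dst_bayer[1] = src_argb[(selector >> 8) & 0xff];
    dst_bayer[2] = src_argb[(selector >> 16) & 0xff];
    dst_bayer[3] = src_argb[(selector >> 24) & 0xff];
    src_argb += 16;
    dst_bayer += 4;
  }
}
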
michael@0 5847 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
michael@0 5848 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
michael@0 5849 uint32 selector, int pix) {
michael@0 5850 asm volatile (
michael@0 5851 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 5852 "psrld $0x18,%%xmm5 \n"
michael@0 5853 LABELALIGN
michael@0 5854 "1: \n"
michael@0 5855 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5856 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 5857 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 5858 "psrld $0x8,%%xmm0 \n"
michael@0 5859 "psrld $0x8,%%xmm1 \n"
michael@0 5860 "pand %%xmm5,%%xmm0 \n"
michael@0 5861 "pand %%xmm5,%%xmm1 \n"
michael@0 5862 "packssdw %%xmm1,%%xmm0 \n"
michael@0 5863 "packuswb %%xmm1,%%xmm0 \n"
michael@0 5864 "sub $0x8,%2 \n"
michael@0 5865 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 5866 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 5867 "jg 1b \n"
michael@0 5868 : "+r"(src_argb), // %0
michael@0 5869 "+r"(dst_bayer), // %1
michael@0 5870 "+r"(pix) // %2
michael@0 5871 :
michael@0 5872 : "memory", "cc"
michael@0 5873 #if defined(__SSE2__)
michael@0 5874 , "xmm0", "xmm1", "xmm5"
michael@0 5875 #endif
michael@0 5876 );
michael@0 5877 }
michael@0 5878 #endif // HAS_ARGBTOBAYERGGROW_SSE2
michael@0 5879
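// Editor's sketch, not part of the original source: the GG variant above
// ignores 'selector' and always extracts the green channel (psrld $0x8
// moves G into the low byte of each pixel before masking and packing).
static void ARGBToBayerGGRowSketch_C(const uint8* src_argb, uint8* dst_bayer,
                                     uint32 selector, int pix) {
  (void)selector;  // Unused; kept only to mirror the asm signature.
  for (int x = 0; x < pix; ++x) {
    dst_bayer[x] = src_argb[x * 4 + 1];  // G of B,G,R,A.
  }
}
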
michael@0 5880 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
michael@0 5881 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
michael@0 5882 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 5883 const uint8* shuffler, int pix) {
michael@0 5884 asm volatile (
michael@0 5885 "movdqa " MEMACCESS(3) ",%%xmm5 \n"
michael@0 5886 LABELALIGN
michael@0 5887 "1: \n"
michael@0 5888 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5889 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 5890 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 5891 "pshufb %%xmm5,%%xmm0 \n"
michael@0 5892 "pshufb %%xmm5,%%xmm1 \n"
michael@0 5893 "sub $0x8,%2 \n"
michael@0 5894 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 5895 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
michael@0 5896 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 5897 "jg 1b \n"
michael@0 5898 : "+r"(src_argb), // %0
michael@0 5899 "+r"(dst_argb), // %1
michael@0 5900 "+r"(pix) // %2
michael@0 5901 : "r"(shuffler) // %3
michael@0 5902 : "memory", "cc"
michael@0 5903 #if defined(__SSE2__)
michael@0 5904 , "xmm0", "xmm1", "xmm5"
michael@0 5905 #endif
michael@0 5906 );
michael@0 5907 }
michael@0 5908
michael@0 5909 void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 5910 const uint8* shuffler, int pix) {
michael@0 5911 asm volatile (
michael@0 5912 "movdqa " MEMACCESS(3) ",%%xmm5 \n"
michael@0 5913 LABELALIGN
michael@0 5914 "1: \n"
michael@0 5915 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 5916 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 5917 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 5918 "pshufb %%xmm5,%%xmm0 \n"
michael@0 5919 "pshufb %%xmm5,%%xmm1 \n"
michael@0 5920 "sub $0x8,%2 \n"
michael@0 5921 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 5922 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
michael@0 5923 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 5924 "jg 1b \n"
michael@0 5925 : "+r"(src_argb), // %0
michael@0 5926 "+r"(dst_argb), // %1
michael@0 5927 "+r"(pix) // %2
michael@0 5928 : "r"(shuffler) // %3
michael@0 5929 : "memory", "cc"
michael@0 5930 #if defined(__SSE2__)
michael@0 5931 , "xmm0", "xmm1", "xmm5"
michael@0 5932 #endif
michael@0 5933 );
michael@0 5934 }
michael@0 5935 #endif // HAS_ARGBSHUFFLEROW_SSSE3
michael@0 5936
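// Editor's sketch, not part of the original source: scalar equivalent of the
// two ARGBShuffleRow variants above, assuming the 16-byte pshufb mask
// repeats the same 4-entry channel permutation for every pixel, as the
// kShuffleMask* constants used by the callers do (hypothetical name):
static void ARGBShuffleRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                   const uint8* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[x * 4 + 0] = src_argb[x * 4 + shuffler[0]];
    dst_argb[x * 4 + 1] = src_argb[x * 4 + shuffler[1]];
    dst_argb[x * 4 + 2] = src_argb[x * 4 + shuffler[2]];
    dst_argb[x * 4 + 3] = src_argb[x * 4 + shuffler[3]];
  }
}
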
michael@0 5937 #ifdef HAS_ARGBSHUFFLEROW_AVX2
michael@0 5938 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
michael@0 5939 void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
michael@0 5940 const uint8* shuffler, int pix) {
michael@0 5941 asm volatile (
michael@0 5942 "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n"
michael@0 5943 LABELALIGN
michael@0 5944 "1: \n"
michael@0 5945 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
michael@0 5946 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
michael@0 5947 "lea " MEMLEA(0x40,0) ",%0 \n"
michael@0 5948 "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
michael@0 5949 "vpshufb %%ymm5,%%ymm1,%%ymm1 \n"
michael@0 5950 "sub $0x10,%2 \n"
michael@0 5951 "vmovdqu %%ymm0," MEMACCESS(1) " \n"
michael@0 5952 "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n"
michael@0 5953 "lea " MEMLEA(0x40,1) ",%1 \n"
michael@0 5954 "jg 1b \n"
michael@0 5955 : "+r"(src_argb), // %0
michael@0 5956 "+r"(dst_argb), // %1
michael@0 5957 "+r"(pix) // %2
michael@0 5958 : "r"(shuffler) // %3
michael@0 5959 : "memory", "cc"
michael@0 5960 #if defined(__SSE2__)
michael@0 5961 , "xmm0", "xmm1", "xmm5"
michael@0 5962 #endif
michael@0 5963 );
michael@0 5964 }
michael@0 5965 #endif // HAS_ARGBSHUFFLEROW_AVX2
michael@0 5966
michael@0 5967 #ifdef HAS_ARGBSHUFFLEROW_SSE2
michael@0 5968 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
michael@0 5969 void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
michael@0 5970 const uint8* shuffler, int pix) {
michael@0 5971 uintptr_t pixel_temp = 0u;
michael@0 5972 asm volatile (
michael@0 5973 "pxor %%xmm5,%%xmm5 \n"
michael@0 5974 "mov " MEMACCESS(4) ",%k2 \n"
michael@0 5975 "cmp $0x3000102,%k2 \n"
michael@0 5976 "je 3012f \n"
michael@0 5977 "cmp $0x10203,%k2 \n"
michael@0 5978 "je 123f \n"
michael@0 5979 "cmp $0x30201,%k2 \n"
michael@0 5980 "je 321f \n"
michael@0 5981 "cmp $0x2010003,%k2 \n"
michael@0 5982 "je 2103f \n"
michael@0 5983
michael@0 5984 LABELALIGN
michael@0 5985 "1: \n"
michael@0 5986 "movzb " MEMACCESS(4) ",%2 \n"
michael@0 5987 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
michael@0 5988 "mov %b2," MEMACCESS(1) " \n"
michael@0 5989 "movzb " MEMACCESS2(0x1,4) ",%2 \n"
michael@0 5990 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
michael@0 5991 "mov %b2," MEMACCESS2(0x1,1) " \n"
michael@0 5992 BUNDLEALIGN
michael@0 5993 "movzb " MEMACCESS2(0x2,4) ",%2 \n"
michael@0 5994 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
michael@0 5995 "mov %b2," MEMACCESS2(0x2,1) " \n"
michael@0 5996 "movzb " MEMACCESS2(0x3,4) ",%2 \n"
michael@0 5997 MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2
michael@0 5998 "mov %b2," MEMACCESS2(0x3,1) " \n"
michael@0 5999 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 6000 "lea " MEMLEA(0x4,1) ",%1 \n"
michael@0 6001 "sub $0x1,%3 \n"
michael@0 6002 "jg 1b \n"
michael@0 6003 "jmp 99f \n"
michael@0 6004
michael@0 6005 LABELALIGN
michael@0 6006 "123: \n"
michael@0 6007 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6008 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 6009 "movdqa %%xmm0,%%xmm1 \n"
michael@0 6010 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 6011 "punpckhbw %%xmm5,%%xmm1 \n"
michael@0 6012 "pshufhw $0x1b,%%xmm0,%%xmm0 \n"
michael@0 6013 "pshuflw $0x1b,%%xmm0,%%xmm0 \n"
michael@0 6014 "pshufhw $0x1b,%%xmm1,%%xmm1 \n"
michael@0 6015 "pshuflw $0x1b,%%xmm1,%%xmm1 \n"
michael@0 6016 "packuswb %%xmm1,%%xmm0 \n"
michael@0 6017 "sub $0x4,%3 \n"
michael@0 6018 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 6019 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 6020 "jg 123b \n"
michael@0 6021 "jmp 99f \n"
michael@0 6022
michael@0 6023 LABELALIGN
michael@0 6024 "321: \n"
michael@0 6025 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6026 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 6027 "movdqa %%xmm0,%%xmm1 \n"
michael@0 6028 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 6029 "punpckhbw %%xmm5,%%xmm1 \n"
michael@0 6030 "pshufhw $0x39,%%xmm0,%%xmm0 \n"
michael@0 6031 "pshuflw $0x39,%%xmm0,%%xmm0 \n"
michael@0 6032 "pshufhw $0x39,%%xmm1,%%xmm1 \n"
michael@0 6033 "pshuflw $0x39,%%xmm1,%%xmm1 \n"
michael@0 6034 "packuswb %%xmm1,%%xmm0 \n"
michael@0 6035 "sub $0x4,%3 \n"
michael@0 6036 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 6037 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 6038 "jg 321b \n"
michael@0 6039 "jmp 99f \n"
michael@0 6040
michael@0 6041 LABELALIGN
michael@0 6042 "2103: \n"
michael@0 6043 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6044 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 6045 "movdqa %%xmm0,%%xmm1 \n"
michael@0 6046 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 6047 "punpckhbw %%xmm5,%%xmm1 \n"
michael@0 6048 "pshufhw $0x93,%%xmm0,%%xmm0 \n"
michael@0 6049 "pshuflw $0x93,%%xmm0,%%xmm0 \n"
michael@0 6050 "pshufhw $0x93,%%xmm1,%%xmm1 \n"
michael@0 6051 "pshuflw $0x93,%%xmm1,%%xmm1 \n"
michael@0 6052 "packuswb %%xmm1,%%xmm0 \n"
michael@0 6053 "sub $0x4,%3 \n"
michael@0 6054 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 6055 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 6056 "jg 2103b \n"
michael@0 6057 "jmp 99f \n"
michael@0 6058
michael@0 6059 LABELALIGN
michael@0 6060 "3012: \n"
michael@0 6061 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6062 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 6063 "movdqa %%xmm0,%%xmm1 \n"
michael@0 6064 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 6065 "punpckhbw %%xmm5,%%xmm1 \n"
michael@0 6066 "pshufhw $0xc6,%%xmm0,%%xmm0 \n"
michael@0 6067 "pshuflw $0xc6,%%xmm0,%%xmm0 \n"
michael@0 6068 "pshufhw $0xc6,%%xmm1,%%xmm1 \n"
michael@0 6069 "pshuflw $0xc6,%%xmm1,%%xmm1 \n"
michael@0 6070 "packuswb %%xmm1,%%xmm0 \n"
michael@0 6071 "sub $0x4,%3 \n"
michael@0 6072 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 6073 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 6074 "jg 3012b \n"
michael@0 6075
michael@0 6076 "99: \n"
michael@0 6077 : "+r"(src_argb), // %0
michael@0 6078 "+r"(dst_argb), // %1
michael@0 6079 "+d"(pixel_temp), // %2
michael@0 6080 "+r"(pix) // %3
michael@0 6081 : "r"(shuffler) // %4
michael@0 6082 : "memory", "cc"
michael@0 6083 #if defined(__native_client__) && defined(__x86_64__)
michael@0 6084 , "r14"
michael@0 6085 #endif
michael@0 6086 #if defined(__SSE2__)
michael@0 6087 , "xmm0", "xmm1", "xmm5"
michael@0 6088 #endif
michael@0 6089 );
michael@0 6090 }
michael@0 6091 #endif // HAS_ARGBSHUFFLEROW_SSE2
michael@0 6092
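// Editor's note, not part of the original source: the magic constants in
// ARGBShuffleRow_SSE2 above are the first four shuffler bytes read as one
// little-endian dword, e.g. shuffler {2,1,0,3} loads as 0x3000102. After
// punpcklbw widens each byte to a word, the matching pshuflw/pshufhw
// immediate encodes the same permutation two bits per word:
//   0x1b -> {3,2,1,0} (reverse)      0x39 -> {1,2,3,0} (rotate)
//   0x93 -> {3,0,1,2} (rotate back)  0xc6 -> {2,1,0,3} (swap R and B)
// Any other shuffler falls through to the byte-at-a-time loop at label 1.
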
michael@0 6093 #ifdef HAS_I422TOYUY2ROW_SSE2
michael@0 6094 void I422ToYUY2Row_SSE2(const uint8* src_y,
michael@0 6095 const uint8* src_u,
michael@0 6096 const uint8* src_v,
michael@0 6097 uint8* dst_frame, int width) {
michael@0 6098 asm volatile (
michael@0 6099 "sub %1,%2 \n"
michael@0 6100 LABELALIGN
michael@0 6101 "1: \n"
michael@0 6102 "movq " MEMACCESS(1) ",%%xmm2 \n"
michael@0 6103 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
michael@0 6104 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 6105 "punpcklbw %%xmm3,%%xmm2 \n"
michael@0 6106 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6107 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 6108 "movdqa %%xmm0,%%xmm1 \n"
michael@0 6109 "punpcklbw %%xmm2,%%xmm0 \n"
michael@0 6110 "punpckhbw %%xmm2,%%xmm1 \n"
michael@0 6111 "movdqu %%xmm0," MEMACCESS(3) " \n"
michael@0 6112 "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n"
michael@0 6113 "lea " MEMLEA(0x20,3) ",%3 \n"
michael@0 6114 "sub $0x10,%4 \n"
michael@0 6115 "jg 1b \n"
michael@0 6116 : "+r"(src_y), // %0
michael@0 6117 "+r"(src_u), // %1
michael@0 6118 "+r"(src_v), // %2
michael@0 6119 "+r"(dst_frame), // %3
michael@0 6120 "+rm"(width) // %4
michael@0 6121 :
michael@0 6122 : "memory", "cc"
michael@0 6123 #if defined(__native_client__) && defined(__x86_64__)
michael@0 6124 , "r14"
michael@0 6125 #endif
michael@0 6126 #if defined(__SSE2__)
michael@0 6127 , "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 6128 #endif
michael@0 6129 );
michael@0 6130 }
michael@0 6131 #endif // HAS_I422TOYUY2ROW_SSE2
michael@0 6132
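// Editor's sketch, not part of the original source: scalar equivalent of
// I422ToYUY2Row_SSE2 above. YUY2 packs two luma samples around one shared
// chroma pair, giving Y0 U Y1 V per two pixels (hypothetical name):
static void I422ToYUY2RowSketch_C(const uint8* src_y, const uint8* src_u,
                                  const uint8* src_v, uint8* dst_frame,
                                  int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}
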
michael@0 6133 #ifdef HAS_I422TOUYVYROW_SSE2
michael@0 6134 void I422ToUYVYRow_SSE2(const uint8* src_y,
michael@0 6135 const uint8* src_u,
michael@0 6136 const uint8* src_v,
michael@0 6137 uint8* dst_frame, int width) {
michael@0 6138 asm volatile (
michael@0 6139 "sub %1,%2 \n"
michael@0 6140 LABELALIGN
michael@0 6141 "1: \n"
michael@0 6142 "movq " MEMACCESS(1) ",%%xmm2 \n"
michael@0 6143 MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3
michael@0 6144 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 6145 "punpcklbw %%xmm3,%%xmm2 \n"
michael@0 6146 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6147 "movdqa %%xmm2,%%xmm1 \n"
michael@0 6148 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 6149 "punpcklbw %%xmm0,%%xmm1 \n"
michael@0 6150 "punpckhbw %%xmm0,%%xmm2 \n"
michael@0 6151 "movdqu %%xmm1," MEMACCESS(3) " \n"
michael@0 6152 "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n"
michael@0 6153 "lea " MEMLEA(0x20,3) ",%3 \n"
michael@0 6154 "sub $0x10,%4 \n"
michael@0 6155 "jg 1b \n"
michael@0 6156 : "+r"(src_y), // %0
michael@0 6157 "+r"(src_u), // %1
michael@0 6158 "+r"(src_v), // %2
michael@0 6159 "+r"(dst_frame), // %3
michael@0 6160 "+rm"(width) // %4
michael@0 6161 :
michael@0 6162 : "memory", "cc"
michael@0 6163 #if defined(__native_client__) && defined(__x86_64__)
michael@0 6164 , "r14"
michael@0 6165 #endif
michael@0 6166 #if defined(__SSE2__)
michael@0 6167 , "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 6168 #endif
michael@0 6169 );
michael@0 6170 }
michael@0 6171 #endif // HAS_I422TOUYVYROW_SSE2
michael@0 6172
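// Editor's sketch, not part of the original source: I422ToUYVYRow_SSE2 above
// is the same 4:2:2 packing with chroma leading, i.e. U Y0 V Y1 per two
// pixels (hypothetical name):
static void I422ToUYVYRowSketch_C(const uint8* src_y, const uint8* src_u,
                                  const uint8* src_v, uint8* dst_frame,
                                  int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_u[0];
    dst_frame[1] = src_y[0];
    dst_frame[2] = src_v[0];
    dst_frame[3] = src_y[1];
    src_y += 2;
    ++src_u;
    ++src_v;
    dst_frame += 4;
  }
}
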
michael@0 6173 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
michael@0 6174 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
michael@0 6175 uint8* dst_argb, const float* poly,
michael@0 6176 int width) {
michael@0 6177 asm volatile (
michael@0 6178 "pxor %%xmm3,%%xmm3 \n"
michael@0 6179
michael@0 6180 // 2 pixel loop.
michael@0 6181 LABELALIGN
michael@0 6182 "1: \n"
michael@0 6183 "movq " MEMACCESS(0) ",%%xmm0 \n"
michael@0 6184 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 6185 "punpcklbw %%xmm3,%%xmm0 \n"
michael@0 6186 "movdqa %%xmm0,%%xmm4 \n"
michael@0 6187 "punpcklwd %%xmm3,%%xmm0 \n"
michael@0 6188 "punpckhwd %%xmm3,%%xmm4 \n"
michael@0 6189 "cvtdq2ps %%xmm0,%%xmm0 \n"
michael@0 6190 "cvtdq2ps %%xmm4,%%xmm4 \n"
michael@0 6191 "movdqa %%xmm0,%%xmm1 \n"
michael@0 6192 "movdqa %%xmm4,%%xmm5 \n"
michael@0 6193 "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n"
michael@0 6194 "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n"
michael@0 6195 "addps " MEMACCESS(3) ",%%xmm0 \n"
michael@0 6196 "addps " MEMACCESS(3) ",%%xmm4 \n"
michael@0 6197 "movdqa %%xmm1,%%xmm2 \n"
michael@0 6198 "movdqa %%xmm5,%%xmm6 \n"
michael@0 6199 "mulps %%xmm1,%%xmm2 \n"
michael@0 6200 "mulps %%xmm5,%%xmm6 \n"
michael@0 6201 "mulps %%xmm2,%%xmm1 \n"
michael@0 6202 "mulps %%xmm6,%%xmm5 \n"
michael@0 6203 "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n"
michael@0 6204 "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n"
michael@0 6205 "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n"
michael@0 6206 "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n"
michael@0 6207 "addps %%xmm2,%%xmm0 \n"
michael@0 6208 "addps %%xmm6,%%xmm4 \n"
michael@0 6209 "addps %%xmm1,%%xmm0 \n"
michael@0 6210 "addps %%xmm5,%%xmm4 \n"
michael@0 6211 "cvttps2dq %%xmm0,%%xmm0 \n"
michael@0 6212 "cvttps2dq %%xmm4,%%xmm4 \n"
michael@0 6213 "packuswb %%xmm4,%%xmm0 \n"
michael@0 6214 "packuswb %%xmm0,%%xmm0 \n"
michael@0 6215 "sub $0x2,%2 \n"
michael@0 6216 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 6217 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 6218 "jg 1b \n"
michael@0 6219 : "+r"(src_argb), // %0
michael@0 6220 "+r"(dst_argb), // %1
michael@0 6221 "+r"(width) // %2
michael@0 6222 : "r"(poly) // %3
michael@0 6223 : "memory", "cc"
michael@0 6224 #if defined(__SSE2__)
michael@0 6225 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 6226 #endif
michael@0 6227 );
michael@0 6228 }
michael@0 6229 #endif // HAS_ARGBPOLYNOMIALROW_SSE2
michael@0 6230
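// Editor's sketch, not part of the original source: scalar reading of
// ARGBPolynomialRow_SSE2 above. 'poly' holds four vectors of per-channel
// coefficients: poly[0..3] = C0, poly[4..7] = C1, poly[8..11] = C2,
// poly[12..15] = C3, so each channel value v maps to
// C0 + C1*v + C2*v^2 + C3*v^3. The asm clamps via cvttps2dq/packuswb
// saturation; this sketch clamps explicitly (hypothetical name):
static void ARGBPolynomialRowSketch_C(const uint8* src_argb, uint8* dst_argb,
                                      const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float v = (float)src_argb[i * 4 + c];
      float r = poly[c] + poly[4 + c] * v + poly[8 + c] * v * v +
                poly[12 + c] * v * v * v;
      if (r < 0.f) r = 0.f;
      if (r > 255.f) r = 255.f;
      dst_argb[i * 4 + c] = (uint8)r;
    }
  }
}
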
michael@0 6231 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2
michael@0 6232 void ARGBPolynomialRow_AVX2(const uint8* src_argb,
michael@0 6233 uint8* dst_argb, const float* poly,
michael@0 6234 int width) {
michael@0 6235 asm volatile (
michael@0 6236 "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"
michael@0 6237 "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
michael@0 6238 "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
michael@0 6239 "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
michael@0 6240
michael@0 6241 // 2 pixel loop.
michael@0 6242 LABELALIGN
michael@0 6243 "1: \n"
michael@0 6244 "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n" // 2 ARGB pixels
michael@0 6245 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 6246 "vcvtdq2ps %%ymm0,%%ymm0 \n" // X as 8 floats
michael@0 6247 "vmulps %%ymm0,%%ymm0,%%ymm2 \n" // X * X
michael@0 6248 "vmulps %%ymm7,%%ymm0,%%ymm3 \n" // C3 * X
michael@0 6249 "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n" // result = C0 + C1 * X
michael@0 6250 "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n" // result += C2 * X * X
michael@0 6251 "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n" // result += C3 * X * X * X
michael@0 6252 "vcvttps2dq %%ymm0,%%ymm0 \n"
michael@0 6253 "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
michael@0 6254 "vpermq $0xd8,%%ymm0,%%ymm0 \n"
michael@0 6255 "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
michael@0 6256 "sub $0x2,%2 \n"
michael@0 6257 "vmovq %%xmm0," MEMACCESS(1) " \n"
michael@0 6258 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 6259 "jg 1b \n"
michael@0 6260 "vzeroupper \n"
michael@0 6261 : "+r"(src_argb), // %0
michael@0 6262 "+r"(dst_argb), // %1
michael@0 6263 "+r"(width) // %2
michael@0 6264 : "r"(poly) // %3
michael@0 6265 : "memory", "cc"
michael@0 6266 #if defined(__SSE2__)
michael@0 6267 // TODO(fbarchard): declare ymm usage when applicable.
michael@0 6268 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 6269 #endif
michael@0 6270 );
michael@0 6271 }
michael@0 6272 #endif // HAS_ARGBPOLYNOMIALROW_AVX2
michael@0 6273
michael@0 6274 #ifdef HAS_ARGBCOLORTABLEROW_X86
michael@0 6275 // Transform ARGB pixels with color table.
michael@0 6276 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
michael@0 6277 int width) {
michael@0 6278 uintptr_t pixel_temp = 0u;
michael@0 6279 asm volatile (
michael@0 6280 // 1 pixel loop.
michael@0 6281 LABELALIGN
michael@0 6282 "1: \n"
michael@0 6283 "movzb " MEMACCESS(0) ",%1 \n"
michael@0 6284 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 6285 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
michael@0 6286 "mov %b1," MEMACCESS2(-0x4,0) " \n"
michael@0 6287 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
michael@0 6288 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
michael@0 6289 "mov %b1," MEMACCESS2(-0x3,0) " \n"
michael@0 6290 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
michael@0 6291 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
michael@0 6292 "mov %b1," MEMACCESS2(-0x2,0) " \n"
michael@0 6293 "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
michael@0 6294 MEMOPARG(movzb,0x03,3,1,4,1) " \n" // movzb 0x3(%3,%1,4),%1
michael@0 6295 "mov %b1," MEMACCESS2(-0x1,0) " \n"
michael@0 6296 "dec %2 \n"
michael@0 6297 "jg 1b \n"
michael@0 6298 : "+r"(dst_argb), // %0
michael@0 6299 "+d"(pixel_temp), // %1
michael@0 6300 "+r"(width) // %2
michael@0 6301 : "r"(table_argb) // %3
michael@0 6302 : "memory", "cc");
michael@0 6303 }
michael@0 6304 #endif // HAS_ARGBCOLORTABLEROW_X86
michael@0 6305
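// Editor's sketch, not part of the original source: scalar equivalent of
// ARGBColorTableRow_X86 above. The table holds 256 interleaved 4-byte
// entries, each channel indexes its own column, and the row is rewritten
// in place (hypothetical name):
static void ARGBColorTableRowSketch_C(uint8* dst_argb,
                                      const uint8* table_argb, int width) {
  for (int x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];
    dst_argb += 4;
  }
}
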
michael@0 6306 #ifdef HAS_RGBCOLORTABLEROW_X86
michael@0 6307 // Transform RGB pixels with color table.
michael@0 6308 void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
michael@0 6309 uintptr_t pixel_temp = 0u;
michael@0 6310 asm volatile (
michael@0 6311 // 1 pixel loop.
michael@0 6312 LABELALIGN
michael@0 6313 "1: \n"
michael@0 6314 "movzb " MEMACCESS(0) ",%1 \n"
michael@0 6315 "lea " MEMLEA(0x4,0) ",%0 \n"
michael@0 6316 MEMOPARG(movzb,0x00,3,1,4,1) " \n" // movzb (%3,%1,4),%1
michael@0 6317 "mov %b1," MEMACCESS2(-0x4,0) " \n"
michael@0 6318 "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
michael@0 6319 MEMOPARG(movzb,0x01,3,1,4,1) " \n" // movzb 0x1(%3,%1,4),%1
michael@0 6320 "mov %b1," MEMACCESS2(-0x3,0) " \n"
michael@0 6321 "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
michael@0 6322 MEMOPARG(movzb,0x02,3,1,4,1) " \n" // movzb 0x2(%3,%1,4),%1
michael@0 6323 "mov %b1," MEMACCESS2(-0x2,0) " \n"
michael@0 6324 "dec %2 \n"
michael@0 6325 "jg 1b \n"
michael@0 6326 : "+r"(dst_argb), // %0
michael@0 6327 "+d"(pixel_temp), // %1
michael@0 6328 "+r"(width) // %2
michael@0 6329 : "r"(table_argb) // %3
michael@0 6330 : "memory", "cc");
michael@0 6331 }
michael@0 6332 #endif // HAS_RGBCOLORTABLEROW_X86
michael@0 6333
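// Editor's note, not part of the original source: RGBColorTableRow_X86 above
// performs the same in-place lookup as the ARGB sketch earlier, but only
// rewrites the first three channels and leaves the alpha byte untouched.
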
michael@0 6334 #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
michael@0 6335 // Transform RGB pixels with luma table, copying alpha unchanged.
michael@0 6336 void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
michael@0 6337 int width,
michael@0 6338 const uint8* luma, uint32 lumacoeff) {
michael@0 6339 uintptr_t pixel_temp = 0u;
michael@0 6340 uintptr_t table_temp = 0u;
michael@0 6341 asm volatile (
michael@0 6342 "movd %6,%%xmm3 \n"
michael@0 6343 "pshufd $0x0,%%xmm3,%%xmm3 \n"
michael@0 6344 "pcmpeqb %%xmm4,%%xmm4 \n"
michael@0 6345 "psllw $0x8,%%xmm4 \n"
michael@0 6346 "pxor %%xmm5,%%xmm5 \n"
michael@0 6347
michael@0 6348 // 4 pixel loop.
michael@0 6349 LABELALIGN
michael@0 6350 "1: \n"
michael@0 6351 "movdqu " MEMACCESS(2) ",%%xmm0 \n"
michael@0 6352 "pmaddubsw %%xmm3,%%xmm0 \n"
michael@0 6353 "phaddw %%xmm0,%%xmm0 \n"
michael@0 6354 "pand %%xmm4,%%xmm0 \n"
michael@0 6355 "punpcklwd %%xmm5,%%xmm0 \n"
michael@0 6356 "movd %%xmm0,%k1 \n" // 32 bit offset
michael@0 6357 "add %5,%1 \n"
michael@0 6358 "pshufd $0x39,%%xmm0,%%xmm0 \n"
michael@0 6359
michael@0 6360 "movzb " MEMACCESS(2) ",%0 \n"
michael@0 6361 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6362 "mov %b0," MEMACCESS(3) " \n"
michael@0 6363 "movzb " MEMACCESS2(0x1,2) ",%0 \n"
michael@0 6364 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6365 "mov %b0," MEMACCESS2(0x1,3) " \n"
michael@0 6366 "movzb " MEMACCESS2(0x2,2) ",%0 \n"
michael@0 6367 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6368 "mov %b0," MEMACCESS2(0x2,3) " \n"
michael@0 6369 "movzb " MEMACCESS2(0x3,2) ",%0 \n"
michael@0 6370 "mov %b0," MEMACCESS2(0x3,3) " \n"
michael@0 6371
michael@0 6372 "movd %%xmm0,%k1 \n" // 32 bit offset
michael@0 6373 "add %5,%1 \n"
michael@0 6374 "pshufd $0x39,%%xmm0,%%xmm0 \n"
michael@0 6375
michael@0 6376 "movzb " MEMACCESS2(0x4,2) ",%0 \n"
michael@0 6377 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6378 "mov %b0," MEMACCESS2(0x4,3) " \n"
michael@0 6379 BUNDLEALIGN
michael@0 6380 "movzb " MEMACCESS2(0x5,2) ",%0 \n"
michael@0 6381 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6382 "mov %b0," MEMACCESS2(0x5,3) " \n"
michael@0 6383 "movzb " MEMACCESS2(0x6,2) ",%0 \n"
michael@0 6384 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6385 "mov %b0," MEMACCESS2(0x6,3) " \n"
michael@0 6386 "movzb " MEMACCESS2(0x7,2) ",%0 \n"
michael@0 6387 "mov %b0," MEMACCESS2(0x7,3) " \n"
michael@0 6388
michael@0 6389 "movd %%xmm0,%k1 \n" // 32 bit offset
michael@0 6390 "add %5,%1 \n"
michael@0 6391 "pshufd $0x39,%%xmm0,%%xmm0 \n"
michael@0 6392
michael@0 6393 "movzb " MEMACCESS2(0x8,2) ",%0 \n"
michael@0 6394 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6395 "mov %b0," MEMACCESS2(0x8,3) " \n"
michael@0 6396 "movzb " MEMACCESS2(0x9,2) ",%0 \n"
michael@0 6397 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6398 "mov %b0," MEMACCESS2(0x9,3) " \n"
michael@0 6399 "movzb " MEMACCESS2(0xa,2) ",%0 \n"
michael@0 6400 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6401 "mov %b0," MEMACCESS2(0xa,3) " \n"
michael@0 6402 "movzb " MEMACCESS2(0xb,2) ",%0 \n"
michael@0 6403 "mov %b0," MEMACCESS2(0xb,3) " \n"
michael@0 6404
michael@0 6405 "movd %%xmm0,%k1 \n" // 32 bit offset
michael@0 6406 "add %5,%1 \n"
michael@0 6407
michael@0 6408 "movzb " MEMACCESS2(0xc,2) ",%0 \n"
michael@0 6409 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6410 "mov %b0," MEMACCESS2(0xc,3) " \n"
michael@0 6411 "movzb " MEMACCESS2(0xd,2) ",%0 \n"
michael@0 6412 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6413 "mov %b0," MEMACCESS2(0xd,3) " \n"
michael@0 6414 "movzb " MEMACCESS2(0xe,2) ",%0 \n"
michael@0 6415 MEMOPARG(movzb,0x00,1,0,1,0) " \n" // movzb (%1,%0,1),%0
michael@0 6416 "mov %b0," MEMACCESS2(0xe,3) " \n"
michael@0 6417 "movzb " MEMACCESS2(0xf,2) ",%0 \n"
michael@0 6418 "mov %b0," MEMACCESS2(0xf,3) " \n"
michael@0 6419 "sub $0x4,%4 \n"
michael@0 6420 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 6421 "lea " MEMLEA(0x10,3) ",%3 \n"
michael@0 6422 "jg 1b \n"
michael@0 6423 : "+d"(pixel_temp), // %0
michael@0 6424 "+a"(table_temp), // %1
michael@0 6425 "+r"(src_argb), // %2
michael@0 6426 "+r"(dst_argb), // %3
michael@0 6427 "+rm"(width) // %4
michael@0 6428 : "r"(luma), // %5
michael@0 6429 "rm"(lumacoeff) // %6
michael@0 6430 : "memory", "cc"
michael@0 6431 #if defined(__SSE2__)
michael@0 6432 , "xmm0", "xmm3", "xmm4", "xmm5"
michael@0 6433 #endif
michael@0 6434 );
michael@0 6435 }
michael@0 6436 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
michael@0 6437
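// Editor's sketch, not part of the original source: scalar reading of
// ARGBLumaColorTableRow_SSSE3 above. 'lumacoeff' packs per-channel weights
// as bytes (B, G, R, with the alpha weight assumed zero, as typical callers
// pass). The weighted sum, masked down to a multiple of 256, selects one
// 256-byte row of 'luma', and B, G and R are remapped through that row
// while alpha is copied. Assumes the weights sum to at most 128 so the
// pmaddubsw in the asm never saturates (hypothetical name):
static void ARGBLumaColorTableRowSketch_C(const uint8* src_argb,
                                          uint8* dst_argb, int width,
                                          const uint8* luma,
                                          uint32 lumacoeff) {
  uint32 bc = lumacoeff & 0xff;
  uint32 gc = (lumacoeff >> 8) & 0xff;
  uint32 rc = (lumacoeff >> 16) & 0xff;
  for (int i = 0; i < width; ++i) {
    const uint8* luma0 =
        luma + ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) &
                0xff00);
    dst_argb[0] = luma0[src_argb[0]];
    dst_argb[1] = luma0[src_argb[1]];
    dst_argb[2] = luma0[src_argb[2]];
    dst_argb[3] = src_argb[3];  // Alpha passes through unmodified.
    src_argb += 4;
    dst_argb += 4;
  }
}
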
michael@0 6438 #endif // defined(__x86_64__) || defined(__i386__)
michael@0 6439
michael@0 6440 #ifdef __cplusplus
michael@0 6441 } // extern "C"
michael@0 6442 } // namespace libyuv
michael@0 6443 #endif
