Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
michael@0 | 3 | * |
michael@0 | 4 | * Use of this source code is governed by a BSD-style license |
michael@0 | 5 | * that can be found in the LICENSE file in the root of the source |
michael@0 | 6 | * tree. An additional intellectual property rights grant can be found |
michael@0 | 7 | * in the file PATENTS. All contributing project authors may |
michael@0 | 8 | * be found in the AUTHORS file in the root of the source tree. |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "libyuv/row.h" |
michael@0 | 12 | |
michael@0 | 13 | #ifdef __cplusplus |
michael@0 | 14 | namespace libyuv { |
michael@0 | 15 | extern "C" { |
michael@0 | 16 | #endif |
michael@0 | 17 | |
michael@0 | 18 | // This module is for GCC x86 and x64. |
michael@0 | 19 | #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) |
michael@0 | 20 | |
michael@0 | 21 | #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) |
michael@0 | 22 | |
michael@0 | 23 | // Constants for ARGB |
michael@0 | 24 | static vec8 kARGBToY = { |
michael@0 | 25 | 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 |
michael@0 | 26 | }; |
michael@0 | 27 | |
michael@0 | 28 | // JPEG full range. |
michael@0 | 29 | static vec8 kARGBToYJ = { |
michael@0 | 30 | 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 |
michael@0 | 31 | }; |
michael@0 | 32 | #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) |
michael@0 | 33 | |
michael@0 | 34 | #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) |
michael@0 | 35 | |
michael@0 | 36 | static vec8 kARGBToU = { |
michael@0 | 37 | 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 |
michael@0 | 38 | }; |
michael@0 | 39 | |
michael@0 | 40 | static vec8 kARGBToUJ = { |
michael@0 | 41 | 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0 |
michael@0 | 42 | }; |
michael@0 | 43 | |
michael@0 | 44 | static vec8 kARGBToV = { |
michael@0 | 45 | -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 |
michael@0 | 46 | }; |
michael@0 | 47 | |
michael@0 | 48 | static vec8 kARGBToVJ = { |
michael@0 | 49 | -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0 |
michael@0 | 50 | }; |
michael@0 | 51 | |
michael@0 | 52 | // Constants for BGRA |
michael@0 | 53 | static vec8 kBGRAToY = { |
michael@0 | 54 | 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 |
michael@0 | 55 | }; |
michael@0 | 56 | |
michael@0 | 57 | static vec8 kBGRAToU = { |
michael@0 | 58 | 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 |
michael@0 | 59 | }; |
michael@0 | 60 | |
michael@0 | 61 | static vec8 kBGRAToV = { |
michael@0 | 62 | 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 |
michael@0 | 63 | }; |
michael@0 | 64 | |
michael@0 | 65 | // Constants for ABGR |
michael@0 | 66 | static vec8 kABGRToY = { |
michael@0 | 67 | 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 |
michael@0 | 68 | }; |
michael@0 | 69 | |
michael@0 | 70 | static vec8 kABGRToU = { |
michael@0 | 71 | -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 |
michael@0 | 72 | }; |
michael@0 | 73 | |
michael@0 | 74 | static vec8 kABGRToV = { |
michael@0 | 75 | 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 |
michael@0 | 76 | }; |
michael@0 | 77 | |
michael@0 | 78 | // Constants for RGBA. |
michael@0 | 79 | static vec8 kRGBAToY = { |
michael@0 | 80 | 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 |
michael@0 | 81 | }; |
michael@0 | 82 | |
michael@0 | 83 | static vec8 kRGBAToU = { |
michael@0 | 84 | 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 |
michael@0 | 85 | }; |
michael@0 | 86 | |
michael@0 | 87 | static vec8 kRGBAToV = { |
michael@0 | 88 | 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 |
michael@0 | 89 | }; |
michael@0 | 90 | |
michael@0 | 91 | static uvec8 kAddY16 = { |
michael@0 | 92 | 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u |
michael@0 | 93 | }; |
michael@0 | 94 | |
michael@0 | 95 | static vec16 kAddYJ64 = { |
michael@0 | 96 | 64, 64, 64, 64, 64, 64, 64, 64 |
michael@0 | 97 | }; |
michael@0 | 98 | |
michael@0 | 99 | static uvec8 kAddUV128 = { |
michael@0 | 100 | 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, |
michael@0 | 101 | 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u |
michael@0 | 102 | }; |
michael@0 | 103 | |
michael@0 | 104 | static uvec16 kAddUVJ128 = { |
michael@0 | 105 | 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u |
michael@0 | 106 | }; |
michael@0 | 107 | #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) |
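// Illustrative scalar sketch (an editorial aid, not libyuv's shipped C
// path) of what the SSSE3 rows below compute with the constants above.
// ARGB is stored little-endian as B,G,R,A, so the first coefficient of
// each table applies to blue.
#if 0
static uint8 RGBToYSketch(uint8 r, uint8 g, uint8 b) {
  // kARGBToY + kAddY16: pmaddubsw/phaddw, psrlw $7, paddb $16.
  return (uint8)(((33 * r + 65 * g + 13 * b) >> 7) + 16);
}
static uint8 RGBToUSketch(uint8 r, uint8 g, uint8 b) {
  // kARGBToU + kAddUV128: pmaddubsw/phaddw, psraw $8, paddb $128.
  return (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
}
static uint8 RGBToVSketch(uint8 r, uint8 g, uint8 b) {
  // kARGBToV + kAddUV128.
  return (uint8)(((112 * r - 94 * g - 18 * b) >> 8) + 128);
}
#endif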
michael@0 | 108 | |
michael@0 | 109 | #ifdef HAS_RGB24TOARGBROW_SSSE3 |
michael@0 | 110 | |
michael@0 | 111 | // Shuffle table for converting RGB24 to ARGB. |
michael@0 | 112 | static uvec8 kShuffleMaskRGB24ToARGB = { |
michael@0 | 113 | 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u |
michael@0 | 114 | }; |
michael@0 | 115 | |
michael@0 | 116 | // Shuffle table for converting RAW to ARGB. |
michael@0 | 117 | static uvec8 kShuffleMaskRAWToARGB = { |
michael@0 | 118 | 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u |
michael@0 | 119 | }; |
michael@0 | 120 | |
michael@0 | 121 | // Shuffle table for converting ARGB to RGB24. |
michael@0 | 122 | static uvec8 kShuffleMaskARGBToRGB24 = { |
michael@0 | 123 | 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u |
michael@0 | 124 | }; |
michael@0 | 125 | |
michael@0 | 126 | // Shuffle table for converting ARGB to RAW. |
michael@0 | 127 | static uvec8 kShuffleMaskARGBToRAW = { |
michael@0 | 128 | 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u |
michael@0 | 129 | }; |
michael@0 | 130 | |
michael@0 | 131 | // Shuffle table for converting ARGB to RGB24 for I422ToRGB24: first 8 bytes, then next 4. |
michael@0 | 132 | static uvec8 kShuffleMaskARGBToRGB24_0 = { |
michael@0 | 133 | 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u |
michael@0 | 134 | }; |
michael@0 | 135 | |
michael@0 | 136 | // Shuffle table for converting ARGB to RAW for I422ToRAW: first 8 bytes, then next 4. |
michael@0 | 137 | static uvec8 kShuffleMaskARGBToRAW_0 = { |
michael@0 | 138 | 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u |
michael@0 | 139 | }; |
michael@0 | 140 | #endif // HAS_RGB24TOARGBROW_SSSE3 |
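// Illustrative scalar model (an editorial sketch of pshufb semantics, kept
// out of the build) of how the masks above are consumed: output byte i is
// src[mask[i] & 15], and a mask byte with the high bit set (128) forces 0.
#if 0
static void PshufbSketch(const uint8 src[16], const uint8 mask[16],
                         uint8 dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (uint8)((mask[i] & 0x80) ? 0 : src[mask[i] & 15]);
  }
}
#endif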
michael@0 | 141 | |
michael@0 | 142 | #if defined(TESTING) && defined(__x86_64__) |
michael@0 | 143 | void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
michael@0 | 144 | asm volatile ( |
michael@0 | 145 | ".p2align 5 \n" |
michael@0 | 146 | "mov %%eax,%%eax \n" |
michael@0 | 147 | "mov %%ebx,%%ebx \n" |
michael@0 | 148 | "mov %%ecx,%%ecx \n" |
michael@0 | 149 | "mov %%edx,%%edx \n" |
michael@0 | 150 | "mov %%esi,%%esi \n" |
michael@0 | 151 | "mov %%edi,%%edi \n" |
michael@0 | 152 | "mov %%ebp,%%ebp \n" |
michael@0 | 153 | "mov %%esp,%%esp \n" |
michael@0 | 154 | ".p2align 5 \n" |
michael@0 | 155 | "mov %%r8d,%%r8d \n" |
michael@0 | 156 | "mov %%r9d,%%r9d \n" |
michael@0 | 157 | "mov %%r10d,%%r10d \n" |
michael@0 | 158 | "mov %%r11d,%%r11d \n" |
michael@0 | 159 | "mov %%r12d,%%r12d \n" |
michael@0 | 160 | "mov %%r13d,%%r13d \n" |
michael@0 | 161 | "mov %%r14d,%%r14d \n" |
michael@0 | 162 | "mov %%r15d,%%r15d \n" |
michael@0 | 163 | ".p2align 5 \n" |
michael@0 | 164 | "lea (%%rax),%%eax \n" |
michael@0 | 165 | "lea (%%rbx),%%ebx \n" |
michael@0 | 166 | "lea (%%rcx),%%ecx \n" |
michael@0 | 167 | "lea (%%rdx),%%edx \n" |
michael@0 | 168 | "lea (%%rsi),%%esi \n" |
michael@0 | 169 | "lea (%%rdi),%%edi \n" |
michael@0 | 170 | "lea (%%rbp),%%ebp \n" |
michael@0 | 171 | "lea (%%rsp),%%esp \n" |
michael@0 | 172 | ".p2align 5 \n" |
michael@0 | 173 | "lea (%%r8),%%r8d \n" |
michael@0 | 174 | "lea (%%r9),%%r9d \n" |
michael@0 | 175 | "lea (%%r10),%%r10d \n" |
michael@0 | 176 | "lea (%%r11),%%r11d \n" |
michael@0 | 177 | "lea (%%r12),%%r12d \n" |
michael@0 | 178 | "lea (%%r13),%%r13d \n" |
michael@0 | 179 | "lea (%%r14),%%r14d \n" |
michael@0 | 180 | "lea (%%r15),%%r15d \n" |
michael@0 | 181 | |
michael@0 | 182 | ".p2align 5 \n" |
michael@0 | 183 | "lea 0x10(%%rax),%%eax \n" |
michael@0 | 184 | "lea 0x10(%%rbx),%%ebx \n" |
michael@0 | 185 | "lea 0x10(%%rcx),%%ecx \n" |
michael@0 | 186 | "lea 0x10(%%rdx),%%edx \n" |
michael@0 | 187 | "lea 0x10(%%rsi),%%esi \n" |
michael@0 | 188 | "lea 0x10(%%rdi),%%edi \n" |
michael@0 | 189 | "lea 0x10(%%rbp),%%ebp \n" |
michael@0 | 190 | "lea 0x10(%%rsp),%%esp \n" |
michael@0 | 191 | ".p2align 5 \n" |
michael@0 | 192 | "lea 0x10(%%r8),%%r8d \n" |
michael@0 | 193 | "lea 0x10(%%r9),%%r9d \n" |
michael@0 | 194 | "lea 0x10(%%r10),%%r10d \n" |
michael@0 | 195 | "lea 0x10(%%r11),%%r11d \n" |
michael@0 | 196 | "lea 0x10(%%r12),%%r12d \n" |
michael@0 | 197 | "lea 0x10(%%r13),%%r13d \n" |
michael@0 | 198 | "lea 0x10(%%r14),%%r14d \n" |
michael@0 | 199 | "lea 0x10(%%r15),%%r15d \n" |
michael@0 | 200 | |
michael@0 | 201 | ".p2align 5 \n" |
michael@0 | 202 | "add 0x10,%%eax \n" |
michael@0 | 203 | "add 0x10,%%ebx \n" |
michael@0 | 204 | "add 0x10,%%ecx \n" |
michael@0 | 205 | "add 0x10,%%edx \n" |
michael@0 | 206 | "add 0x10,%%esi \n" |
michael@0 | 207 | "add 0x10,%%edi \n" |
michael@0 | 208 | "add 0x10,%%ebp \n" |
michael@0 | 209 | "add 0x10,%%esp \n" |
michael@0 | 210 | ".p2align 5 \n" |
michael@0 | 211 | "add 0x10,%%r8d \n" |
michael@0 | 212 | "add 0x10,%%r9d \n" |
michael@0 | 213 | "add 0x10,%%r10d \n" |
michael@0 | 214 | "add 0x10,%%r11d \n" |
michael@0 | 215 | "add 0x10,%%r12d \n" |
michael@0 | 216 | "add 0x10,%%r13d \n" |
michael@0 | 217 | "add 0x10,%%r14d \n" |
michael@0 | 218 | "add 0x10,%%r15d \n" |
michael@0 | 219 | |
michael@0 | 220 | ".p2align 2 \n" |
michael@0 | 221 | "1: \n" |
michael@0 | 222 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 223 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 224 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 225 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 226 | "sub $0x8,%2 \n" |
michael@0 | 227 | "jg 1b \n" |
michael@0 | 228 | : "+r"(src_y), // %0 |
michael@0 | 229 | "+r"(dst_argb), // %1 |
michael@0 | 230 | "+r"(pix) // %2 |
michael@0 | 231 | : |
michael@0 | 232 | : "memory", "cc" |
michael@0 | 233 | #if defined(__SSE2__) |
michael@0 | 234 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 235 | #endif |
michael@0 | 236 | ); |
michael@0 | 237 | } |
michael@0 | 238 | #endif // defined(TESTING) && defined(__x86_64__) |
michael@0 | 239 | |
michael@0 | 240 | #ifdef HAS_I400TOARGBROW_SSE2 |
michael@0 | 241 | void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { |
michael@0 | 242 | asm volatile ( |
michael@0 | 243 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 244 | "pslld $0x18,%%xmm5 \n" |
michael@0 | 245 | LABELALIGN |
michael@0 | 246 | "1: \n" |
michael@0 | 247 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 248 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 249 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 250 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 251 | "punpcklwd %%xmm0,%%xmm0 \n" |
michael@0 | 252 | "punpckhwd %%xmm1,%%xmm1 \n" |
michael@0 | 253 | "por %%xmm5,%%xmm0 \n" |
michael@0 | 254 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 255 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 256 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 257 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 258 | "sub $0x8,%2 \n" |
michael@0 | 259 | "jg 1b \n" |
michael@0 | 260 | : "+r"(src_y), // %0 |
michael@0 | 261 | "+r"(dst_argb), // %1 |
michael@0 | 262 | "+r"(pix) // %2 |
michael@0 | 263 | : |
michael@0 | 264 | : "memory", "cc" |
michael@0 | 265 | #if defined(__SSE2__) |
michael@0 | 266 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 267 | #endif |
michael@0 | 268 | ); |
michael@0 | 269 | } |
michael@0 | 270 | |
michael@0 | 271 | void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, |
michael@0 | 272 | int pix) { |
michael@0 | 273 | asm volatile ( |
michael@0 | 274 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 275 | "pslld $0x18,%%xmm5 \n" |
michael@0 | 276 | LABELALIGN |
michael@0 | 277 | "1: \n" |
michael@0 | 278 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 279 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 280 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 281 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 282 | "punpcklwd %%xmm0,%%xmm0 \n" |
michael@0 | 283 | "punpckhwd %%xmm1,%%xmm1 \n" |
michael@0 | 284 | "por %%xmm5,%%xmm0 \n" |
michael@0 | 285 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 286 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 287 | "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 288 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 289 | "sub $0x8,%2 \n" |
michael@0 | 290 | "jg 1b \n" |
michael@0 | 291 | : "+r"(src_y), // %0 |
michael@0 | 292 | "+r"(dst_argb), // %1 |
michael@0 | 293 | "+r"(pix) // %2 |
michael@0 | 294 | : |
michael@0 | 295 | : "memory", "cc" |
michael@0 | 296 | #if defined(__SSE2__) |
michael@0 | 297 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 298 | #endif |
michael@0 | 299 | ); |
michael@0 | 300 | } |
michael@0 | 301 | #endif // HAS_I400TOARGBROW_SSE2 |
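// Scalar sketch (illustrative only) of the expansion above: each luma byte
// is replicated to B, G and R by the punpck sequence and given an opaque
// alpha by or-ing in the 0xff000000 mask held in xmm5.
#if 0
static void I400ToARGBSketch(const uint8* src_y, uint8* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_y[i];  // B
    dst_argb[4 * i + 1] = src_y[i];  // G
    dst_argb[4 * i + 2] = src_y[i];  // R
    dst_argb[4 * i + 3] = 255;       // A
  }
}
#endif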
michael@0 | 302 | |
michael@0 | 303 | #ifdef HAS_RGB24TOARGBROW_SSSE3 |
michael@0 | 304 | void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { |
michael@0 | 305 | asm volatile ( |
michael@0 | 306 | "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 |
michael@0 | 307 | "pslld $0x18,%%xmm5 \n" |
michael@0 | 308 | "movdqa %3,%%xmm4 \n" |
michael@0 | 309 | LABELALIGN |
michael@0 | 310 | "1: \n" |
michael@0 | 311 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 312 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 313 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" |
michael@0 | 314 | "lea " MEMLEA(0x30,0) ",%0 \n" |
michael@0 | 315 | "movdqa %%xmm3,%%xmm2 \n" |
michael@0 | 316 | "palignr $0x8,%%xmm1,%%xmm2 \n" |
michael@0 | 317 | "pshufb %%xmm4,%%xmm2 \n" |
michael@0 | 318 | "por %%xmm5,%%xmm2 \n" |
michael@0 | 319 | "palignr $0xc,%%xmm0,%%xmm1 \n" |
michael@0 | 320 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 321 | "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" |
michael@0 | 322 | "por %%xmm5,%%xmm0 \n" |
michael@0 | 323 | "pshufb %%xmm4,%%xmm1 \n" |
michael@0 | 324 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 325 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 326 | "palignr $0x4,%%xmm3,%%xmm3 \n" |
michael@0 | 327 | "pshufb %%xmm4,%%xmm3 \n" |
michael@0 | 328 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 329 | "por %%xmm5,%%xmm3 \n" |
michael@0 | 330 | "sub $0x10,%2 \n" |
michael@0 | 331 | "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" |
michael@0 | 332 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 333 | "jg 1b \n" |
michael@0 | 334 | : "+r"(src_rgb24), // %0 |
michael@0 | 335 | "+r"(dst_argb), // %1 |
michael@0 | 336 | "+r"(pix) // %2 |
michael@0 | 337 | : "m"(kShuffleMaskRGB24ToARGB) // %3 |
michael@0 | 338 | : "memory", "cc" |
michael@0 | 339 | #if defined(__SSE2__) |
michael@0 | 340 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 341 | #endif |
michael@0 | 342 | ); |
michael@0 | 343 | } |
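// Scalar sketch (illustrative only) of the conversion above; the palignr
// instructions realign the 48 packed source bytes so each pshufb sees its
// 12 RGB bytes within one register.
#if 0
static void RGB24ToARGBSketch(const uint8* src_rgb24, uint8* dst_argb,
                              int pix) {
  for (int i = 0; i < pix; ++i) {
    dst_argb[4 * i + 0] = src_rgb24[3 * i + 0];  // B
    dst_argb[4 * i + 1] = src_rgb24[3 * i + 1];  // G
    dst_argb[4 * i + 2] = src_rgb24[3 * i + 2];  // R
    dst_argb[4 * i + 3] = 255;                   // A: the 0xff000000 mask.
  }
}
#endif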
michael@0 | 344 | |
michael@0 | 345 | void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { |
michael@0 | 346 | asm volatile ( |
michael@0 | 347 | "pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000 |
michael@0 | 348 | "pslld $0x18,%%xmm5 \n" |
michael@0 | 349 | "movdqa %3,%%xmm4 \n" |
michael@0 | 350 | LABELALIGN |
michael@0 | 351 | "1: \n" |
michael@0 | 352 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 353 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 354 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm3 \n" |
michael@0 | 355 | "lea " MEMLEA(0x30,0) ",%0 \n" |
michael@0 | 356 | "movdqa %%xmm3,%%xmm2 \n" |
michael@0 | 357 | "palignr $0x8,%%xmm1,%%xmm2 \n" |
michael@0 | 358 | "pshufb %%xmm4,%%xmm2 \n" |
michael@0 | 359 | "por %%xmm5,%%xmm2 \n" |
michael@0 | 360 | "palignr $0xc,%%xmm0,%%xmm1 \n" |
michael@0 | 361 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 362 | "movdqa %%xmm2," MEMACCESS2(0x20,1) " \n" |
michael@0 | 363 | "por %%xmm5,%%xmm0 \n" |
michael@0 | 364 | "pshufb %%xmm4,%%xmm1 \n" |
michael@0 | 365 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 366 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 367 | "palignr $0x4,%%xmm3,%%xmm3 \n" |
michael@0 | 368 | "pshufb %%xmm4,%%xmm3 \n" |
michael@0 | 369 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 370 | "por %%xmm5,%%xmm3 \n" |
michael@0 | 371 | "sub $0x10,%2 \n" |
michael@0 | 372 | "movdqa %%xmm3," MEMACCESS2(0x30,1) " \n" |
michael@0 | 373 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 374 | "jg 1b \n" |
michael@0 | 375 | : "+r"(src_raw), // %0 |
michael@0 | 376 | "+r"(dst_argb), // %1 |
michael@0 | 377 | "+r"(pix) // %2 |
michael@0 | 378 | : "m"(kShuffleMaskRAWToARGB) // %3 |
michael@0 | 379 | : "memory", "cc" |
michael@0 | 380 | #if defined(__SSE2__) |
michael@0 | 381 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 382 | #endif |
michael@0 | 383 | ); |
michael@0 | 384 | } |
michael@0 | 385 | |
michael@0 | 386 | void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { |
michael@0 | 387 | asm volatile ( |
michael@0 | 388 | "mov $0x1080108,%%eax \n" |
michael@0 | 389 | "movd %%eax,%%xmm5 \n" |
michael@0 | 390 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 391 | "mov $0x20802080,%%eax \n" |
michael@0 | 392 | "movd %%eax,%%xmm6 \n" |
michael@0 | 393 | "pshufd $0x0,%%xmm6,%%xmm6 \n" |
michael@0 | 394 | "pcmpeqb %%xmm3,%%xmm3 \n" |
michael@0 | 395 | "psllw $0xb,%%xmm3 \n" |
michael@0 | 396 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 397 | "psllw $0xa,%%xmm4 \n" |
michael@0 | 398 | "psrlw $0x5,%%xmm4 \n" |
michael@0 | 399 | "pcmpeqb %%xmm7,%%xmm7 \n" |
michael@0 | 400 | "psllw $0x8,%%xmm7 \n" |
michael@0 | 401 | "sub %0,%1 \n" |
michael@0 | 402 | "sub %0,%1 \n" |
michael@0 | 403 | LABELALIGN |
michael@0 | 404 | "1: \n" |
michael@0 | 405 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 406 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 407 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 408 | "pand %%xmm3,%%xmm1 \n" |
michael@0 | 409 | "psllw $0xb,%%xmm2 \n" |
michael@0 | 410 | "pmulhuw %%xmm5,%%xmm1 \n" |
michael@0 | 411 | "pmulhuw %%xmm5,%%xmm2 \n" |
michael@0 | 412 | "psllw $0x8,%%xmm1 \n" |
michael@0 | 413 | "por %%xmm2,%%xmm1 \n" |
michael@0 | 414 | "pand %%xmm4,%%xmm0 \n" |
michael@0 | 415 | "pmulhuw %%xmm6,%%xmm0 \n" |
michael@0 | 416 | "por %%xmm7,%%xmm0 \n" |
michael@0 | 417 | "movdqa %%xmm1,%%xmm2 \n" |
michael@0 | 418 | "punpcklbw %%xmm0,%%xmm1 \n" |
michael@0 | 419 | "punpckhbw %%xmm0,%%xmm2 \n" |
michael@0 | 420 | BUNDLEALIGN |
michael@0 | 421 | MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) |
michael@0 | 422 | MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) |
michael@0 | 423 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 424 | "sub $0x8,%2 \n" |
michael@0 | 425 | "jg 1b \n" |
michael@0 | 426 | : "+r"(src), // %0 |
michael@0 | 427 | "+r"(dst), // %1 |
michael@0 | 428 | "+r"(pix) // %2 |
michael@0 | 429 | : |
michael@0 | 430 | : "memory", "cc", "eax" |
michael@0 | 431 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 432 | , "r14" |
michael@0 | 433 | #endif |
michael@0 | 434 | #if defined(__SSE2__) |
michael@0 | 435 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 436 | #endif |
michael@0 | 437 | ); |
michael@0 | 438 | } |
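// Scalar sketch (illustrative only) of the RGB565 expansion above. The
// pmulhuw constants 0x0108 and 0x2080 replicate the top bits of each
// field: (x << 3) | (x >> 2) for the 5-bit fields and (x << 2) | (x >> 4)
// for the 6-bit green field.
#if 0
static void RGB565ToARGBSketch(const uint8* src, uint8* dst, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint16 v = (uint16)(src[2 * i] | (src[2 * i + 1] << 8));
    uint8 b = (uint8)(v & 0x1f);
    uint8 g = (uint8)((v >> 5) & 0x3f);
    uint8 r = (uint8)(v >> 11);
    dst[4 * i + 0] = (uint8)((b << 3) | (b >> 2));
    dst[4 * i + 1] = (uint8)((g << 2) | (g >> 4));
    dst[4 * i + 2] = (uint8)((r << 3) | (r >> 2));
    dst[4 * i + 3] = 255;
  }
}
#endif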
michael@0 | 439 | |
michael@0 | 440 | void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { |
michael@0 | 441 | asm volatile ( |
michael@0 | 442 | "mov $0x1080108,%%eax \n" |
michael@0 | 443 | "movd %%eax,%%xmm5 \n" |
michael@0 | 444 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 445 | "mov $0x42004200,%%eax \n" |
michael@0 | 446 | "movd %%eax,%%xmm6 \n" |
michael@0 | 447 | "pshufd $0x0,%%xmm6,%%xmm6 \n" |
michael@0 | 448 | "pcmpeqb %%xmm3,%%xmm3 \n" |
michael@0 | 449 | "psllw $0xb,%%xmm3 \n" |
michael@0 | 450 | "movdqa %%xmm3,%%xmm4 \n" |
michael@0 | 451 | "psrlw $0x6,%%xmm4 \n" |
michael@0 | 452 | "pcmpeqb %%xmm7,%%xmm7 \n" |
michael@0 | 453 | "psllw $0x8,%%xmm7 \n" |
michael@0 | 454 | "sub %0,%1 \n" |
michael@0 | 455 | "sub %0,%1 \n" |
michael@0 | 456 | LABELALIGN |
michael@0 | 457 | "1: \n" |
michael@0 | 458 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 459 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 460 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 461 | "psllw $0x1,%%xmm1 \n" |
michael@0 | 462 | "psllw $0xb,%%xmm2 \n" |
michael@0 | 463 | "pand %%xmm3,%%xmm1 \n" |
michael@0 | 464 | "pmulhuw %%xmm5,%%xmm2 \n" |
michael@0 | 465 | "pmulhuw %%xmm5,%%xmm1 \n" |
michael@0 | 466 | "psllw $0x8,%%xmm1 \n" |
michael@0 | 467 | "por %%xmm2,%%xmm1 \n" |
michael@0 | 468 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 469 | "pand %%xmm4,%%xmm0 \n" |
michael@0 | 470 | "psraw $0x8,%%xmm2 \n" |
michael@0 | 471 | "pmulhuw %%xmm6,%%xmm0 \n" |
michael@0 | 472 | "pand %%xmm7,%%xmm2 \n" |
michael@0 | 473 | "por %%xmm2,%%xmm0 \n" |
michael@0 | 474 | "movdqa %%xmm1,%%xmm2 \n" |
michael@0 | 475 | "punpcklbw %%xmm0,%%xmm1 \n" |
michael@0 | 476 | "punpckhbw %%xmm0,%%xmm2 \n" |
michael@0 | 477 | BUNDLEALIGN |
michael@0 | 478 | MEMOPMEM(movdqa,xmm1,0x00,1,0,2) // movdqa %%xmm1,(%1,%0,2) |
michael@0 | 479 | MEMOPMEM(movdqa,xmm2,0x10,1,0,2) // movdqa %%xmm2,0x10(%1,%0,2) |
michael@0 | 480 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 481 | "sub $0x8,%2 \n" |
michael@0 | 482 | "jg 1b \n" |
michael@0 | 483 | : "+r"(src), // %0 |
michael@0 | 484 | "+r"(dst), // %1 |
michael@0 | 485 | "+r"(pix) // %2 |
michael@0 | 486 | : |
michael@0 | 487 | : "memory", "cc", "eax" |
michael@0 | 488 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 489 | , "r14" |
michael@0 | 490 | #endif |
michael@0 | 491 | #if defined(__SSE2__) |
michael@0 | 492 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 493 | #endif |
michael@0 | 494 | ); |
michael@0 | 495 | } |
michael@0 | 496 | |
michael@0 | 497 | void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) { |
michael@0 | 498 | asm volatile ( |
michael@0 | 499 | "mov $0xf0f0f0f,%%eax \n" |
michael@0 | 500 | "movd %%eax,%%xmm4 \n" |
michael@0 | 501 | "pshufd $0x0,%%xmm4,%%xmm4 \n" |
michael@0 | 502 | "movdqa %%xmm4,%%xmm5 \n" |
michael@0 | 503 | "pslld $0x4,%%xmm5 \n" |
michael@0 | 504 | "sub %0,%1 \n" |
michael@0 | 505 | "sub %0,%1 \n" |
michael@0 | 506 | LABELALIGN |
michael@0 | 507 | "1: \n" |
michael@0 | 508 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 509 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 510 | "pand %%xmm4,%%xmm0 \n" |
michael@0 | 511 | "pand %%xmm5,%%xmm2 \n" |
michael@0 | 512 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 513 | "movdqa %%xmm2,%%xmm3 \n" |
michael@0 | 514 | "psllw $0x4,%%xmm1 \n" |
michael@0 | 515 | "psrlw $0x4,%%xmm3 \n" |
michael@0 | 516 | "por %%xmm1,%%xmm0 \n" |
michael@0 | 517 | "por %%xmm3,%%xmm2 \n" |
michael@0 | 518 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 519 | "punpcklbw %%xmm2,%%xmm0 \n" |
michael@0 | 520 | "punpckhbw %%xmm2,%%xmm1 \n" |
michael@0 | 521 | BUNDLEALIGN |
michael@0 | 522 | MEMOPMEM(movdqa,xmm0,0x00,1,0,2) // movdqa %%xmm0,(%1,%0,2) |
michael@0 | 523 | MEMOPMEM(movdqa,xmm1,0x10,1,0,2) // movdqa %%xmm1,0x10(%1,%0,2) |
michael@0 | 524 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 525 | "sub $0x8,%2 \n" |
michael@0 | 526 | "jg 1b \n" |
michael@0 | 527 | : "+r"(src), // %0 |
michael@0 | 528 | "+r"(dst), // %1 |
michael@0 | 529 | "+r"(pix) // %2 |
michael@0 | 530 | : |
michael@0 | 531 | : "memory", "cc", "eax" |
michael@0 | 532 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 533 | , "r14" |
michael@0 | 534 | #endif |
michael@0 | 535 | #if defined(__SSE2__) |
michael@0 | 536 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 537 | #endif |
michael@0 | 538 | ); |
michael@0 | 539 | } |
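// Scalar sketch (illustrative only) of the 4444 expansion above: each
// 4-bit field is widened by nibble replication, (n << 4) | n == n * 17,
// which is what the shift/por pairs on the masked registers implement.
#if 0
static uint8 Expand4To8Sketch(uint8 n) {  // n in [0, 15].
  return (uint8)((n << 4) | n);
}
#endif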
michael@0 | 540 | |
michael@0 | 541 | void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) { |
michael@0 | 542 | asm volatile ( |
michael@0 | 543 | "movdqa %3,%%xmm6 \n" |
michael@0 | 544 | LABELALIGN |
michael@0 | 545 | "1: \n" |
michael@0 | 546 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 547 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 548 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 549 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 550 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 551 | "pshufb %%xmm6,%%xmm0 \n" |
michael@0 | 552 | "pshufb %%xmm6,%%xmm1 \n" |
michael@0 | 553 | "pshufb %%xmm6,%%xmm2 \n" |
michael@0 | 554 | "pshufb %%xmm6,%%xmm3 \n" |
michael@0 | 555 | "movdqa %%xmm1,%%xmm4 \n" |
michael@0 | 556 | "psrldq $0x4,%%xmm1 \n" |
michael@0 | 557 | "pslldq $0xc,%%xmm4 \n" |
michael@0 | 558 | "movdqa %%xmm2,%%xmm5 \n" |
michael@0 | 559 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 560 | "pslldq $0x8,%%xmm5 \n" |
michael@0 | 561 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 562 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 563 | "psrldq $0x8,%%xmm2 \n" |
michael@0 | 564 | "pslldq $0x4,%%xmm3 \n" |
michael@0 | 565 | "por %%xmm3,%%xmm2 \n" |
michael@0 | 566 | "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 567 | "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" |
michael@0 | 568 | "lea " MEMLEA(0x30,1) ",%1 \n" |
michael@0 | 569 | "sub $0x10,%2 \n" |
michael@0 | 570 | "jg 1b \n" |
michael@0 | 571 | : "+r"(src), // %0 |
michael@0 | 572 | "+r"(dst), // %1 |
michael@0 | 573 | "+r"(pix) // %2 |
michael@0 | 574 | : "m"(kShuffleMaskARGBToRGB24) // %3 |
michael@0 | 575 | : "memory", "cc" |
michael@0 | 576 | #if defined(__SSE2__) |
michael@0 | 577 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 578 | #endif |
michael@0 | 579 | ); |
michael@0 | 580 | } |
michael@0 | 581 | |
michael@0 | 582 | void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) { |
michael@0 | 583 | asm volatile ( |
michael@0 | 584 | "movdqa %3,%%xmm6 \n" |
michael@0 | 585 | LABELALIGN |
michael@0 | 586 | "1: \n" |
michael@0 | 587 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 588 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 589 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 590 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 591 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 592 | "pshufb %%xmm6,%%xmm0 \n" |
michael@0 | 593 | "pshufb %%xmm6,%%xmm1 \n" |
michael@0 | 594 | "pshufb %%xmm6,%%xmm2 \n" |
michael@0 | 595 | "pshufb %%xmm6,%%xmm3 \n" |
michael@0 | 596 | "movdqa %%xmm1,%%xmm4 \n" |
michael@0 | 597 | "psrldq $0x4,%%xmm1 \n" |
michael@0 | 598 | "pslldq $0xc,%%xmm4 \n" |
michael@0 | 599 | "movdqa %%xmm2,%%xmm5 \n" |
michael@0 | 600 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 601 | "pslldq $0x8,%%xmm5 \n" |
michael@0 | 602 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 603 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 604 | "psrldq $0x8,%%xmm2 \n" |
michael@0 | 605 | "pslldq $0x4,%%xmm3 \n" |
michael@0 | 606 | "por %%xmm3,%%xmm2 \n" |
michael@0 | 607 | "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 608 | "movdqu %%xmm2," MEMACCESS2(0x20,1) " \n" |
michael@0 | 609 | "lea " MEMLEA(0x30,1) ",%1 \n" |
michael@0 | 610 | "sub $0x10,%2 \n" |
michael@0 | 611 | "jg 1b \n" |
michael@0 | 612 | : "+r"(src), // %0 |
michael@0 | 613 | "+r"(dst), // %1 |
michael@0 | 614 | "+r"(pix) // %2 |
michael@0 | 615 | : "m"(kShuffleMaskARGBToRAW) // %3 |
michael@0 | 616 | : "memory", "cc" |
michael@0 | 617 | #if defined(__SSE2__) |
michael@0 | 618 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 619 | #endif |
michael@0 | 620 | ); |
michael@0 | 621 | } |
michael@0 | 622 | |
michael@0 | 623 | void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) { |
michael@0 | 624 | asm volatile ( |
michael@0 | 625 | "pcmpeqb %%xmm3,%%xmm3 \n" |
michael@0 | 626 | "psrld $0x1b,%%xmm3 \n" |
michael@0 | 627 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 628 | "psrld $0x1a,%%xmm4 \n" |
michael@0 | 629 | "pslld $0x5,%%xmm4 \n" |
michael@0 | 630 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 631 | "pslld $0xb,%%xmm5 \n" |
michael@0 | 632 | LABELALIGN |
michael@0 | 633 | "1: \n" |
michael@0 | 634 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 635 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 636 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 637 | "pslld $0x8,%%xmm0 \n" |
michael@0 | 638 | "psrld $0x3,%%xmm1 \n" |
michael@0 | 639 | "psrld $0x5,%%xmm2 \n" |
michael@0 | 640 | "psrad $0x10,%%xmm0 \n" |
michael@0 | 641 | "pand %%xmm3,%%xmm1 \n" |
michael@0 | 642 | "pand %%xmm4,%%xmm2 \n" |
michael@0 | 643 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 644 | "por %%xmm2,%%xmm1 \n" |
michael@0 | 645 | "por %%xmm1,%%xmm0 \n" |
michael@0 | 646 | "packssdw %%xmm0,%%xmm0 \n" |
michael@0 | 647 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 648 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 649 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 650 | "sub $0x4,%2 \n" |
michael@0 | 651 | "jg 1b \n" |
michael@0 | 652 | : "+r"(src), // %0 |
michael@0 | 653 | "+r"(dst), // %1 |
michael@0 | 654 | "+r"(pix) // %2 |
michael@0 | 655 | : |
michael@0 | 656 | : "memory", "cc" |
michael@0 | 657 | #if defined(__SSE2__) |
michael@0 | 658 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 659 | #endif |
michael@0 | 660 | ); |
michael@0 | 661 | } |
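// Scalar sketch (illustrative only) of the packing above: truncate each
// channel to its field width and merge into one 5:6:5 16-bit value.
#if 0
static uint16 ARGBToRGB565Sketch(uint8 b, uint8 g, uint8 r) {
  return (uint16)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
}
#endif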
michael@0 | 662 | |
michael@0 | 663 | void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) { |
michael@0 | 664 | asm volatile ( |
michael@0 | 665 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 666 | "psrld $0x1b,%%xmm4 \n" |
michael@0 | 667 | "movdqa %%xmm4,%%xmm5 \n" |
michael@0 | 668 | "pslld $0x5,%%xmm5 \n" |
michael@0 | 669 | "movdqa %%xmm4,%%xmm6 \n" |
michael@0 | 670 | "pslld $0xa,%%xmm6 \n" |
michael@0 | 671 | "pcmpeqb %%xmm7,%%xmm7 \n" |
michael@0 | 672 | "pslld $0xf,%%xmm7 \n" |
michael@0 | 673 | LABELALIGN |
michael@0 | 674 | "1: \n" |
michael@0 | 675 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 676 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 677 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 678 | "movdqa %%xmm0,%%xmm3 \n" |
michael@0 | 679 | "psrad $0x10,%%xmm0 \n" |
michael@0 | 680 | "psrld $0x3,%%xmm1 \n" |
michael@0 | 681 | "psrld $0x6,%%xmm2 \n" |
michael@0 | 682 | "psrld $0x9,%%xmm3 \n" |
michael@0 | 683 | "pand %%xmm7,%%xmm0 \n" |
michael@0 | 684 | "pand %%xmm4,%%xmm1 \n" |
michael@0 | 685 | "pand %%xmm5,%%xmm2 \n" |
michael@0 | 686 | "pand %%xmm6,%%xmm3 \n" |
michael@0 | 687 | "por %%xmm1,%%xmm0 \n" |
michael@0 | 688 | "por %%xmm3,%%xmm2 \n" |
michael@0 | 689 | "por %%xmm2,%%xmm0 \n" |
michael@0 | 690 | "packssdw %%xmm0,%%xmm0 \n" |
michael@0 | 691 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 692 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 693 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 694 | "sub $0x4,%2 \n" |
michael@0 | 695 | "jg 1b \n" |
michael@0 | 696 | : "+r"(src), // %0 |
michael@0 | 697 | "+r"(dst), // %1 |
michael@0 | 698 | "+r"(pix) // %2 |
michael@0 | 699 | : |
michael@0 | 700 | : "memory", "cc" |
michael@0 | 701 | #if defined(__SSE2__) |
michael@0 | 702 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 703 | #endif |
michael@0 | 704 | ); |
michael@0 | 705 | } |
michael@0 | 706 | |
michael@0 | 707 | void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { |
michael@0 | 708 | asm volatile ( |
michael@0 | 709 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 710 | "psllw $0xc,%%xmm4 \n" |
michael@0 | 711 | "movdqa %%xmm4,%%xmm3 \n" |
michael@0 | 712 | "psrlw $0x8,%%xmm3 \n" |
michael@0 | 713 | LABELALIGN |
michael@0 | 714 | "1: \n" |
michael@0 | 715 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 716 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 717 | "pand %%xmm3,%%xmm0 \n" |
michael@0 | 718 | "pand %%xmm4,%%xmm1 \n" |
michael@0 | 719 | "psrlq $0x4,%%xmm0 \n" |
michael@0 | 720 | "psrlq $0x8,%%xmm1 \n" |
michael@0 | 721 | "por %%xmm1,%%xmm0 \n" |
michael@0 | 722 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 723 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 724 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 725 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 726 | "sub $0x4,%2 \n" |
michael@0 | 727 | "jg 1b \n" |
michael@0 | 728 | : "+r"(src), // %0 |
michael@0 | 729 | "+r"(dst), // %1 |
michael@0 | 730 | "+r"(pix) // %2 |
michael@0 | 731 | : |
michael@0 | 732 | : "memory", "cc" |
michael@0 | 733 | #if defined(__SSE2__) |
michael@0 | 734 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
michael@0 | 735 | #endif |
michael@0 | 736 | ); |
michael@0 | 737 | } |
michael@0 | 738 | #endif // HAS_RGB24TOARGBROW_SSSE3 |
michael@0 | 739 | |
michael@0 | 740 | #ifdef HAS_ARGBTOYROW_SSSE3 |
michael@0 | 741 | void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
michael@0 | 742 | asm volatile ( |
michael@0 | 743 | "movdqa %4,%%xmm5 \n" |
michael@0 | 744 | "movdqa %3,%%xmm4 \n" |
michael@0 | 745 | LABELALIGN |
michael@0 | 746 | "1: \n" |
michael@0 | 747 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 748 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 749 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 750 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 751 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 752 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 753 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 754 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 755 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 756 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 757 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 758 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 759 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 760 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 761 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 762 | "sub $0x10,%2 \n" |
michael@0 | 763 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 764 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 765 | "jg 1b \n" |
michael@0 | 766 | : "+r"(src_argb), // %0 |
michael@0 | 767 | "+r"(dst_y), // %1 |
michael@0 | 768 | "+r"(pix) // %2 |
michael@0 | 769 | : "m"(kARGBToY), // %3 |
michael@0 | 770 | "m"(kAddY16) // %4 |
michael@0 | 771 | : "memory", "cc" |
michael@0 | 772 | #if defined(__SSE2__) |
michael@0 | 773 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 774 | #endif |
michael@0 | 775 | ); |
michael@0 | 776 | } |
michael@0 | 777 | |
michael@0 | 778 | void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
michael@0 | 779 | asm volatile ( |
michael@0 | 780 | "movdqa %4,%%xmm5 \n" |
michael@0 | 781 | "movdqa %3,%%xmm4 \n" |
michael@0 | 782 | LABELALIGN |
michael@0 | 783 | "1: \n" |
michael@0 | 784 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 785 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 786 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 787 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 788 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 789 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 790 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 791 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 792 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 793 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 794 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 795 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 796 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 797 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 798 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 799 | "sub $0x10,%2 \n" |
michael@0 | 800 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 801 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 802 | "jg 1b \n" |
michael@0 | 803 | : "+r"(src_argb), // %0 |
michael@0 | 804 | "+r"(dst_y), // %1 |
michael@0 | 805 | "+r"(pix) // %2 |
michael@0 | 806 | : "m"(kARGBToY), // %3 |
michael@0 | 807 | "m"(kAddY16) // %4 |
michael@0 | 808 | : "memory", "cc" |
michael@0 | 809 | #if defined(__SSE2__) |
michael@0 | 810 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 811 | #endif |
michael@0 | 812 | ); |
michael@0 | 813 | } |
michael@0 | 814 | #endif // HAS_ARGBTOYROW_SSSE3 |
michael@0 | 815 | |
michael@0 | 816 | #ifdef HAS_ARGBTOYJROW_SSSE3 |
michael@0 | 817 | void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
michael@0 | 818 | asm volatile ( |
michael@0 | 819 | "movdqa %3,%%xmm4 \n" |
michael@0 | 820 | "movdqa %4,%%xmm5 \n" |
michael@0 | 821 | LABELALIGN |
michael@0 | 822 | "1: \n" |
michael@0 | 823 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 824 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 825 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 826 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 827 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 828 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 829 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 830 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 831 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 832 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 833 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 834 | "paddw %%xmm5,%%xmm0 \n" |
michael@0 | 835 | "paddw %%xmm5,%%xmm2 \n" |
michael@0 | 836 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 837 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 838 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 839 | "sub $0x10,%2 \n" |
michael@0 | 840 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 841 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 842 | "jg 1b \n" |
michael@0 | 843 | : "+r"(src_argb), // %0 |
michael@0 | 844 | "+r"(dst_y), // %1 |
michael@0 | 845 | "+r"(pix) // %2 |
michael@0 | 846 | : "m"(kARGBToYJ), // %3 |
michael@0 | 847 | "m"(kAddYJ64) // %4 |
michael@0 | 848 | : "memory", "cc" |
michael@0 | 849 | #if defined(__SSE2__) |
michael@0 | 850 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 851 | #endif |
michael@0 | 852 | ); |
michael@0 | 853 | } |
michael@0 | 854 | |
michael@0 | 855 | void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { |
michael@0 | 856 | asm volatile ( |
michael@0 | 857 | "movdqa %3,%%xmm4 \n" |
michael@0 | 858 | "movdqa %4,%%xmm5 \n" |
michael@0 | 859 | LABELALIGN |
michael@0 | 860 | "1: \n" |
michael@0 | 861 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 862 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 863 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 864 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 865 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 866 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 867 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 868 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 869 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 870 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 871 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 872 | "paddw %%xmm5,%%xmm0 \n" |
michael@0 | 873 | "paddw %%xmm5,%%xmm2 \n" |
michael@0 | 874 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 875 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 876 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 877 | "sub $0x10,%2 \n" |
michael@0 | 878 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 879 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 880 | "jg 1b \n" |
michael@0 | 881 | : "+r"(src_argb), // %0 |
michael@0 | 882 | "+r"(dst_y), // %1 |
michael@0 | 883 | "+r"(pix) // %2 |
michael@0 | 884 | : "m"(kARGBToYJ), // %3 |
michael@0 | 885 | "m"(kAddYJ64) // %4 |
michael@0 | 886 | : "memory", "cc" |
michael@0 | 887 | #if defined(__SSE2__) |
michael@0 | 888 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 889 | #endif |
michael@0 | 890 | ); |
michael@0 | 891 | } |
michael@0 | 892 | #endif // HAS_ARGBTOYJROW_SSSE3 |
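// Scalar sketch (illustrative only) of the J ("JPEG", full range) variant
// above: round with +64 (kAddYJ64) before the shift and apply no +16
// offset.
#if 0
static uint8 RGBToYJSketch(uint8 r, uint8 g, uint8 b) {
  return (uint8)((38 * r + 75 * g + 15 * b + 64) >> 7);
}
#endif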
michael@0 | 893 | |
michael@0 | 894 | #ifdef HAS_ARGBTOUVROW_SSSE3 |
michael@0 | 895 | // TODO(fbarchard): pass xmm constants to a single block of assembly. |
michael@0 | 896 | // With -fpic, GCC 4.2 for OSX runs out of GPR registers. "m" effectively |
michael@0 | 897 | // takes 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal |
michael@0 | 898 | // registers, or 4 if the stack frame is disabled. Using 2 assembly blocks |
michael@0 | 899 | // is a workaround and is considered unsafe. |
michael@0 | 900 | void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 901 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 902 | asm volatile ( |
michael@0 | 903 | "movdqa %0,%%xmm4 \n" |
michael@0 | 904 | "movdqa %1,%%xmm3 \n" |
michael@0 | 905 | "movdqa %2,%%xmm5 \n" |
michael@0 | 906 | : |
michael@0 | 907 | : "m"(kARGBToU), // %0 |
michael@0 | 908 | "m"(kARGBToV), // %1 |
michael@0 | 909 | "m"(kAddUV128) // %2 |
michael@0 | 910 | ); |
michael@0 | 911 | asm volatile ( |
michael@0 | 912 | "sub %1,%2 \n" |
michael@0 | 913 | LABELALIGN |
michael@0 | 914 | "1: \n" |
michael@0 | 915 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 916 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 917 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 918 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 919 | BUNDLEALIGN |
michael@0 | 920 | MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 |
michael@0 | 921 | MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 |
michael@0 | 922 | MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 |
michael@0 | 923 | MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 |
michael@0 | 924 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 925 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 926 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 927 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 928 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 929 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 930 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 931 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 932 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 933 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 934 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 935 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 936 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 937 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 938 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 939 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 940 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 941 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 942 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 943 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 944 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 945 | "sub $0x10,%3 \n" |
michael@0 | 946 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 947 | BUNDLEALIGN |
michael@0 | 948 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 949 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 950 | "jg 1b \n" |
michael@0 | 951 | : "+r"(src_argb0), // %0 |
michael@0 | 952 | "+r"(dst_u), // %1 |
michael@0 | 953 | "+r"(dst_v), // %2 |
michael@0 | 954 | "+rm"(width) // %3 |
michael@0 | 955 | : "r"((intptr_t)(src_stride_argb)) // %4 |
michael@0 | 956 | : "memory", "cc" |
michael@0 | 957 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 958 | , "r14" |
michael@0 | 959 | #endif |
michael@0 | 960 | #if defined(__SSE2__) |
michael@0 | 961 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 962 | #endif |
michael@0 | 963 | ); |
michael@0 | 964 | } |
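// Scalar sketch (illustrative only, with slightly different rounding than
// the pavgb sequence) of one 2x2-subsampled U/V pair computed above: the
// four pixels of a 2x2 block are averaged, then weighted and biased.
#if 0
static void ARGBToUVPairSketch(const uint8* p0, const uint8* p1,
                               uint8* u, uint8* v) {
  // p0/p1 point at two horizontally adjacent pixels on two adjacent rows.
  int b = (p0[0] + p0[4] + p1[0] + p1[4] + 2) >> 2;
  int g = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
  int r = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
  *u = (uint8)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}
#endif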
michael@0 | 965 | |
michael@0 | 966 | // TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. |
michael@0 | 967 | void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 968 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 969 | asm volatile ( |
michael@0 | 970 | "movdqa %0,%%xmm4 \n" |
michael@0 | 971 | "movdqa %1,%%xmm3 \n" |
michael@0 | 972 | "movdqa %2,%%xmm5 \n" |
michael@0 | 973 | : |
michael@0 | 974 | : "m"(kARGBToUJ), // %0 |
michael@0 | 975 | "m"(kARGBToVJ), // %1 |
michael@0 | 976 | "m"(kAddUVJ128) // %2 |
michael@0 | 977 | ); |
michael@0 | 978 | asm volatile ( |
michael@0 | 979 | "sub %1,%2 \n" |
michael@0 | 980 | LABELALIGN |
michael@0 | 981 | "1: \n" |
michael@0 | 982 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 983 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 984 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 985 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 986 | BUNDLEALIGN |
michael@0 | 987 | MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 |
michael@0 | 988 | MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 |
michael@0 | 989 | MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 |
michael@0 | 990 | MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 |
michael@0 | 991 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 992 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 993 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 994 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 995 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 996 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 997 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 998 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 999 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1000 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1001 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1002 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1003 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1004 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1005 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1006 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1007 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1008 | "paddw %%xmm5,%%xmm0 \n" |
michael@0 | 1009 | "paddw %%xmm5,%%xmm1 \n" |
michael@0 | 1010 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1011 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1012 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1013 | "sub $0x10,%3 \n" |
michael@0 | 1014 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1015 | BUNDLEALIGN |
michael@0 | 1016 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1017 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1018 | "jg 1b \n" |
michael@0 | 1019 | : "+r"(src_argb0), // %0 |
michael@0 | 1020 | "+r"(dst_u), // %1 |
michael@0 | 1021 | "+r"(dst_v), // %2 |
michael@0 | 1022 | "+rm"(width) // %3 |
michael@0 | 1023 | : "r"((intptr_t)(src_stride_argb)) // %4 |
michael@0 | 1024 | : "memory", "cc" |
michael@0 | 1025 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1026 | , "r14" |
michael@0 | 1027 | #endif |
michael@0 | 1028 | #if defined(__SSE2__) |
michael@0 | 1029 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1030 | #endif |
michael@0 | 1031 | ); |
michael@0 | 1032 | } |
michael@0 | 1033 | |
michael@0 | 1034 | void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1035 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1036 | asm volatile ( |
michael@0 | 1037 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1038 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1039 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1040 | : |
michael@0 | 1041 | : "m"(kARGBToU), // %0 |
michael@0 | 1042 | "m"(kARGBToV), // %1 |
michael@0 | 1043 | "m"(kAddUV128) // %2 |
michael@0 | 1044 | ); |
michael@0 | 1045 | asm volatile ( |
michael@0 | 1046 | "sub %1,%2 \n" |
michael@0 | 1047 | LABELALIGN |
michael@0 | 1048 | "1: \n" |
michael@0 | 1049 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1050 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1051 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1052 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1053 | BUNDLEALIGN |
michael@0 | 1054 | MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
michael@0 | 1055 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1056 | MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 |
michael@0 | 1057 | "pavgb %%xmm7,%%xmm1 \n" |
michael@0 | 1058 | MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 |
michael@0 | 1059 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1060 | MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 |
michael@0 | 1061 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 1062 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1063 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1064 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1065 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1066 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1067 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1068 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1069 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1070 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1071 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1072 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1073 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1074 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1075 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1076 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1077 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1078 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1079 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1080 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1081 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1082 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1083 | "sub $0x10,%3 \n" |
michael@0 | 1084 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1085 | BUNDLEALIGN |
michael@0 | 1086 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1087 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1088 | "jg 1b \n" |
michael@0 | 1089 | : "+r"(src_argb0), // %0 |
michael@0 | 1090 | "+r"(dst_u), // %1 |
michael@0 | 1091 | "+r"(dst_v), // %2 |
michael@0 | 1092 | "+rm"(width) // %3 |
michael@0 | 1093 | : "r"((intptr_t)(src_stride_argb)) // %4 |
michael@0 | 1094 | : "memory", "cc" |
michael@0 | 1095 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1096 | , "r14" |
michael@0 | 1097 | #endif |
michael@0 | 1098 | #if defined(__SSE2__) |
michael@0 | 1099 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1100 | #endif |
michael@0 | 1101 | ); |
michael@0 | 1102 | } |
michael@0 | 1103 | |
michael@0 | 1104 | void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, |
michael@0 | 1105 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1106 | asm volatile ( |
michael@0 | 1107 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1108 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1109 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1110 | : |
michael@0 | 1111 | : "m"(kARGBToUJ), // %0 |
michael@0 | 1112 | "m"(kARGBToVJ), // %1 |
michael@0 | 1113 | "m"(kAddUVJ128) // %2 |
michael@0 | 1114 | ); |
michael@0 | 1115 | asm volatile ( |
michael@0 | 1116 | "sub %1,%2 \n" |
michael@0 | 1117 | LABELALIGN |
michael@0 | 1118 | "1: \n" |
michael@0 | 1119 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1120 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1121 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1122 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1123 | BUNDLEALIGN |
michael@0 | 1124 | MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
michael@0 | 1125 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1126 | MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 |
michael@0 | 1127 | "pavgb %%xmm7,%%xmm1 \n" |
michael@0 | 1128 | MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 |
michael@0 | 1129 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1130 | MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 |
michael@0 | 1131 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 1132 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1133 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1134 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1135 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1136 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1137 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1138 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1139 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1140 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1141 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1142 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1143 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1144 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1145 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1146 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1147 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1148 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1149 | "paddw %%xmm5,%%xmm0 \n" |
michael@0 | 1150 | "paddw %%xmm5,%%xmm1 \n" |
michael@0 | 1151 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1152 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1153 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1154 | "sub $0x10,%3 \n" |
michael@0 | 1155 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1156 | BUNDLEALIGN |
michael@0 | 1157 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1158 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1159 | "jg 1b \n" |
michael@0 | 1160 | : "+r"(src_argb0), // %0 |
michael@0 | 1161 | "+r"(dst_u), // %1 |
michael@0 | 1162 | "+r"(dst_v), // %2 |
michael@0 | 1163 | "+rm"(width) // %3 |
michael@0 | 1164 | : "r"((intptr_t)(src_stride_argb)) // %4 |
michael@0 | 1165 | : "memory", "cc" |
michael@0 | 1166 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1167 | , "r14" |
michael@0 | 1168 | #endif |
michael@0 | 1169 | #if defined(__SSE2__) |
michael@0 | 1170 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1171 | #endif |
michael@0 | 1172 | ); |
michael@0 | 1173 | } |
michael@0 | 1174 | |
michael@0 | 1175 | void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, |
michael@0 | 1176 | int width) { |
michael@0 | 1177 | asm volatile ( |
michael@0 | 1178 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1179 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1180 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1181 | : |
michael@0 | 1182 | : "m"(kARGBToU), // %0 |
michael@0 | 1183 | "m"(kARGBToV), // %1 |
michael@0 | 1184 | "m"(kAddUV128) // %2 |
michael@0 | 1185 | ); |
michael@0 | 1186 | asm volatile ( |
michael@0 | 1187 | "sub %1,%2 \n" |
michael@0 | 1188 | LABELALIGN |
michael@0 | 1189 | "1: \n" |
michael@0 | 1190 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1191 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1192 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1193 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1194 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1195 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1196 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1197 | "pmaddubsw %%xmm4,%%xmm6 \n" |
michael@0 | 1198 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1199 | "phaddw %%xmm6,%%xmm2 \n" |
michael@0 | 1200 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1201 | "psraw $0x8,%%xmm2 \n" |
michael@0 | 1202 | "packsswb %%xmm2,%%xmm0 \n" |
michael@0 | 1203 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1204 | "sub $0x10,%3 \n" |
michael@0 | 1205 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1206 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1207 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1208 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1209 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1210 | "pmaddubsw %%xmm3,%%xmm0 \n" |
michael@0 | 1211 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1212 | "pmaddubsw %%xmm3,%%xmm2 \n" |
michael@0 | 1213 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1214 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1215 | "phaddw %%xmm6,%%xmm2 \n" |
michael@0 | 1216 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1217 | "psraw $0x8,%%xmm2 \n" |
michael@0 | 1218 | "packsswb %%xmm2,%%xmm0 \n" |
michael@0 | 1219 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1220 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1221 | BUNDLEALIGN |
michael@0 | 1222 | MEMOPMEM(movdqa,xmm0,0x00,1,2,1) // movdqa %%xmm0,(%1,%2,1) |
michael@0 | 1223 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1224 | "jg 1b \n" |
michael@0 | 1225 | : "+r"(src_argb), // %0 |
michael@0 | 1226 | "+r"(dst_u), // %1 |
michael@0 | 1227 | "+r"(dst_v), // %2 |
michael@0 | 1228 | "+rm"(width) // %3 |
michael@0 | 1229 | : |
michael@0 | 1230 | : "memory", "cc" |
michael@0 | 1231 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1232 | , "r14" |
michael@0 | 1233 | #endif |
michael@0 | 1234 | #if defined(__SSE2__) |
michael@0 | 1235 | , "xmm0", "xmm1", "xmm2", "xmm6" |
michael@0 | 1236 | #endif |
michael@0 | 1237 | ); |
michael@0 | 1238 | } |
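// Scalar sketch (illustrative only) of the 4:4:4 variant above: the same
// kARGBToU/kARGBToV weights, but with no 2x2 averaging, so one U and one
// V are produced per source pixel.
#if 0
static void ARGBToUV444Sketch(const uint8* argb, uint8* u, uint8* v) {
  *u = (uint8)(((112 * argb[0] - 74 * argb[1] - 38 * argb[2]) >> 8) + 128);
  *v = (uint8)(((-18 * argb[0] - 94 * argb[1] + 112 * argb[2]) >> 8) + 128);
}
#endif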
michael@0 | 1239 | |
michael@0 | 1240 | void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u, |
michael@0 | 1241 | uint8* dst_v, int width) { |
michael@0 | 1242 | asm volatile ( |
michael@0 | 1243 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1244 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1245 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1246 | : |
michael@0 | 1247 | : "m"(kARGBToU), // %0 |
michael@0 | 1248 | "m"(kARGBToV), // %1 |
michael@0 | 1249 | "m"(kAddUV128) // %2 |
michael@0 | 1250 | ); |
michael@0 | 1251 | asm volatile ( |
michael@0 | 1252 | "sub %1,%2 \n" |
michael@0 | 1253 | LABELALIGN |
michael@0 | 1254 | "1: \n" |
michael@0 | 1255 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1256 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1257 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1258 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1259 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1260 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1261 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1262 | "pmaddubsw %%xmm4,%%xmm6 \n" |
michael@0 | 1263 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1264 | "phaddw %%xmm6,%%xmm2 \n" |
michael@0 | 1265 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1266 | "psraw $0x8,%%xmm2 \n" |
michael@0 | 1267 | "packsswb %%xmm2,%%xmm0 \n" |
michael@0 | 1268 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1269 | "sub $0x10,%3 \n" |
michael@0 | 1270 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1271 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1272 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1273 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1274 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1275 | "pmaddubsw %%xmm3,%%xmm0 \n" |
michael@0 | 1276 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1277 | "pmaddubsw %%xmm3,%%xmm2 \n" |
michael@0 | 1278 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1279 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1280 | "phaddw %%xmm6,%%xmm2 \n" |
michael@0 | 1281 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1282 | "psraw $0x8,%%xmm2 \n" |
michael@0 | 1283 | "packsswb %%xmm2,%%xmm0 \n" |
michael@0 | 1284 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1285 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1286 | BUNDLEALIGN |
michael@0 | 1287 | MEMOPMEM(movdqu,xmm0,0x00,1,2,1) // movdqu %%xmm0,(%1,%2,1) |
michael@0 | 1288 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1289 | "jg 1b \n" |
michael@0 | 1290 | : "+r"(src_argb), // %0 |
michael@0 | 1291 | "+r"(dst_u), // %1 |
michael@0 | 1292 | "+r"(dst_v), // %2 |
michael@0 | 1293 | "+rm"(width) // %3 |
michael@0 | 1294 | : |
michael@0 | 1295 | : "memory", "cc" |
michael@0 | 1296 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1297 | , "r14" |
michael@0 | 1298 | #endif |
michael@0 | 1299 | #if defined(__SSE2__) |
michael@0 | 1300 | , "xmm0", "xmm1", "xmm2", "xmm6" |
michael@0 | 1301 | #endif |
michael@0 | 1302 | ); |
michael@0 | 1303 | } |
michael@0 | 1304 | |
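// The _Unaligned variant above is identical to ARGBToUV444Row_SSSE3 except
// that it loads and stores with movdqu instead of movdqa.  Callers choose
// between the two roughly as sketched below (IS_ALIGNED is from
// libyuv/basic_types.h; real callers also check the destination pointers):
//
//   if (IS_ALIGNED(src_argb, 16)) {
//     ARGBToUV444Row_SSSE3(src_argb, dst_u, dst_v, width);            // movdqa
//   } else {
//     ARGBToUV444Row_Unaligned_SSSE3(src_argb, dst_u, dst_v, width);  // movdqu
//   }
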
michael@0 | 1305 | void ARGBToUV422Row_SSSE3(const uint8* src_argb0, |
michael@0 | 1306 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1307 | asm volatile ( |
michael@0 | 1308 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1309 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1310 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1311 | : |
michael@0 | 1312 | : "m"(kARGBToU), // %0 |
michael@0 | 1313 | "m"(kARGBToV), // %1 |
michael@0 | 1314 | "m"(kAddUV128) // %2 |
michael@0 | 1315 | ); |
michael@0 | 1316 | asm volatile ( |
michael@0 | 1317 | "sub %1,%2 \n" |
michael@0 | 1318 | LABELALIGN |
michael@0 | 1319 | "1: \n" |
michael@0 | 1320 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1321 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1322 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1323 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1324 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1325 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1326 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1327 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1328 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1329 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1330 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1331 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1332 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1333 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1334 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1335 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1336 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1337 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1338 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1339 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1340 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1341 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1342 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1343 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1344 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1345 | "sub $0x10,%3 \n" |
michael@0 | 1346 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1347 | BUNDLEALIGN |
michael@0 | 1348 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1349 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1350 | "jg 1b \n" |
michael@0 | 1351 | : "+r"(src_argb0), // %0 |
michael@0 | 1352 | "+r"(dst_u), // %1 |
michael@0 | 1353 | "+r"(dst_v), // %2 |
michael@0 | 1354 | "+rm"(width) // %3 |
michael@0 | 1355 | : |
michael@0 | 1356 | : "memory", "cc" |
michael@0 | 1357 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1358 | , "r14" |
michael@0 | 1359 | #endif |
michael@0 | 1360 | #if defined(__SSE2__) |
michael@0 | 1361 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1362 | #endif |
michael@0 | 1363 | ); |
michael@0 | 1364 | } |
michael@0 | 1365 | |
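// Sketch of the 2x1 subsample in the loop above: shufps $0x88 gathers the
// even-indexed pixels and shufps $0xdd the odd-indexed pixels of each 8-pixel
// group, then pavgb averages the two channel-by-channel with rounding before
// the U/V transform runs.  The illustrative helper below matches pavgb:
static __inline uint8 AvgRound(uint8 a, uint8 b) {
  return (uint8)((a + b + 1) >> 1);  // same rounding as pavgb
}
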
michael@0 | 1366 | void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0, |
michael@0 | 1367 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1368 | asm volatile ( |
michael@0 | 1369 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1370 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1371 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1372 | : |
michael@0 | 1373 | : "m"(kARGBToU), // %0 |
michael@0 | 1374 | "m"(kARGBToV), // %1 |
michael@0 | 1375 | "m"(kAddUV128) // %2 |
michael@0 | 1376 | ); |
michael@0 | 1377 | asm volatile ( |
michael@0 | 1378 | "sub %1,%2 \n" |
michael@0 | 1379 | LABELALIGN |
michael@0 | 1380 | "1: \n" |
michael@0 | 1381 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1382 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1383 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1384 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1385 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1386 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1387 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1388 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1389 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1390 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1391 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1392 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1393 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1394 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1395 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1396 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1397 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1398 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1399 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1400 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1401 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1402 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1403 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1404 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1405 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1406 | "sub $0x10,%3 \n" |
michael@0 | 1407 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1408 | BUNDLEALIGN |
michael@0 | 1409 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1410 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1411 | "jg 1b \n" |
michael@0 | 1412 | : "+r"(src_argb0), // %0 |
michael@0 | 1413 | "+r"(dst_u), // %1 |
michael@0 | 1414 | "+r"(dst_v), // %2 |
michael@0 | 1415 | "+rm"(width) // %3 |
michael@0 | 1416 | : |
michael@0 | 1417 | : "memory", "cc" |
michael@0 | 1418 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1419 | , "r14" |
michael@0 | 1420 | #endif |
michael@0 | 1421 | #if defined(__SSE2__) |
michael@0 | 1422 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1423 | #endif |
michael@0 | 1424 | ); |
michael@0 | 1425 | } |
michael@0 | 1426 | |
michael@0 | 1427 | void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { |
michael@0 | 1428 | asm volatile ( |
michael@0 | 1429 | "movdqa %4,%%xmm5 \n" |
michael@0 | 1430 | "movdqa %3,%%xmm4 \n" |
michael@0 | 1431 | LABELALIGN |
michael@0 | 1432 | "1: \n" |
michael@0 | 1433 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1434 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1435 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1436 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 1437 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1438 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1439 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1440 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 1441 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1442 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1443 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 1444 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1445 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 1446 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 1447 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1448 | "sub $0x10,%2 \n" |
michael@0 | 1449 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1450 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1451 | "jg 1b \n" |
michael@0 | 1452 | : "+r"(src_bgra), // %0 |
michael@0 | 1453 | "+r"(dst_y), // %1 |
michael@0 | 1454 | "+r"(pix) // %2 |
michael@0 | 1455 | : "m"(kBGRAToY), // %3 |
michael@0 | 1456 | "m"(kAddY16) // %4 |
michael@0 | 1457 | : "memory", "cc" |
michael@0 | 1458 | #if defined(__SSE2__) |
michael@0 | 1459 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 1460 | #endif |
michael@0 | 1461 | ); |
michael@0 | 1462 | } |
michael@0 | 1463 | |
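// Scalar sketch of the luma computation above.  BGRA is stored A,R,G,B in
// memory, so kBGRAToY = {0, 33, 65, 13} weighs R, G and B; psrlw $7 and the
// paddb of kAddY16 complete it.  Helper name is illustrative, not library API.
static __inline int RGBPixelToY(int r, int g, int b) {
  return ((33 * r + 65 * g + 13 * b) >> 7) + 16;
}
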
michael@0 | 1464 | void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { |
michael@0 | 1465 | asm volatile ( |
michael@0 | 1466 | "movdqa %4,%%xmm5 \n" |
michael@0 | 1467 | "movdqa %3,%%xmm4 \n" |
michael@0 | 1468 | LABELALIGN |
michael@0 | 1469 | "1: \n" |
michael@0 | 1470 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1471 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1472 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1473 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 1474 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1475 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1476 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1477 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 1478 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1479 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1480 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 1481 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1482 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 1483 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 1484 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1485 | "sub $0x10,%2 \n" |
michael@0 | 1486 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1487 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1488 | "jg 1b \n" |
michael@0 | 1489 | : "+r"(src_bgra), // %0 |
michael@0 | 1490 | "+r"(dst_y), // %1 |
michael@0 | 1491 | "+r"(pix) // %2 |
michael@0 | 1492 | : "m"(kBGRAToY), // %3 |
michael@0 | 1493 | "m"(kAddY16) // %4 |
michael@0 | 1494 | : "memory", "cc" |
michael@0 | 1495 | #if defined(__SSE2__) |
michael@0 | 1496 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 1497 | #endif |
michael@0 | 1498 | ); |
michael@0 | 1499 | } |
michael@0 | 1500 | |
michael@0 | 1501 | void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, |
michael@0 | 1502 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1503 | asm volatile ( |
michael@0 | 1504 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1505 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1506 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1507 | : |
michael@0 | 1508 | : "m"(kBGRAToU), // %0 |
michael@0 | 1509 | "m"(kBGRAToV), // %1 |
michael@0 | 1510 | "m"(kAddUV128) // %2 |
michael@0 | 1511 | ); |
michael@0 | 1512 | asm volatile ( |
michael@0 | 1513 | "sub %1,%2 \n" |
michael@0 | 1514 | LABELALIGN |
michael@0 | 1515 | "1: \n" |
michael@0 | 1516 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1517 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1518 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1519 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1520 | BUNDLEALIGN |
michael@0 | 1521 | MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 |
michael@0 | 1522 | MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 |
michael@0 | 1523 | MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 |
michael@0 | 1524 | MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 |
michael@0 | 1525 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1526 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1527 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1528 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1529 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1530 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1531 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1532 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1533 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1534 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1535 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1536 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1537 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1538 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1539 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1540 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1541 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1542 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1543 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1544 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1545 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1546 | "sub $0x10,%3 \n" |
michael@0 | 1547 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1548 | BUNDLEALIGN |
michael@0 | 1549 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1550 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1551 | "jg 1b \n" |
michael@0 | 1552 | : "+r"(src_bgra0), // %0 |
michael@0 | 1553 | "+r"(dst_u), // %1 |
michael@0 | 1554 | "+r"(dst_v), // %2 |
michael@0 | 1555 | "+rm"(width) // %3 |
michael@0 | 1556 | : "r"((intptr_t)(src_stride_bgra)) // %4 |
michael@0 | 1557 | : "memory", "cc" |
michael@0 | 1558 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1559 | , "r14" |
michael@0 | 1560 | #endif |
michael@0 | 1561 | #if defined(__SSE2__) |
michael@0 | 1562 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1563 | #endif |
michael@0 | 1564 | ); |
michael@0 | 1565 | } |
michael@0 | 1566 | |
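// Sketch of the 2x2 box filter above: the MEMOPREG pavgb ops average each
// pixel with the pixel directly below it (src + src_stride_bgra), then the
// shufps/pavgb pair averages horizontal neighbours, so every U/V sample comes
// from a rounded average of a 2x2 block, per channel c:
//
//   vert0 = (row0[x][c]     + row1[x][c]     + 1) >> 1;
//   vert1 = (row0[x + 1][c] + row1[x + 1][c] + 1) >> 1;
//   avg   = (vert0 + vert1 + 1) >> 1;
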
michael@0 | 1567 | void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, |
michael@0 | 1568 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1569 | asm volatile ( |
michael@0 | 1570 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1571 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1572 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1573 | : |
michael@0 | 1574 | : "m"(kBGRAToU), // %0 |
michael@0 | 1575 | "m"(kBGRAToV), // %1 |
michael@0 | 1576 | "m"(kAddUV128) // %2 |
michael@0 | 1577 | ); |
michael@0 | 1578 | asm volatile ( |
michael@0 | 1579 | "sub %1,%2 \n" |
michael@0 | 1580 | LABELALIGN |
michael@0 | 1581 | "1: \n" |
michael@0 | 1582 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1583 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1584 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1585 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1586 | BUNDLEALIGN |
michael@0 | 1587 | MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
michael@0 | 1588 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1589 | MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 |
michael@0 | 1590 | "pavgb %%xmm7,%%xmm1 \n" |
michael@0 | 1591 | MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 |
michael@0 | 1592 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1593 | MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 |
michael@0 | 1594 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 1595 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1596 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1597 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1598 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1599 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1600 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1601 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1602 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1603 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1604 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1605 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1606 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1607 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1608 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1609 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1610 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1611 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1612 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1613 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1614 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1615 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1616 | "sub $0x10,%3 \n" |
michael@0 | 1617 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1618 | BUNDLEALIGN |
michael@0 | 1619 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1620 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1621 | "jg 1b \n" |
michael@0 | 1622 | : "+r"(src_bgra0), // %0 |
michael@0 | 1623 | "+r"(dst_u), // %1 |
michael@0 | 1624 | "+r"(dst_v), // %2 |
michael@0 | 1625 | "+rm"(width) // %3 |
michael@0 | 1626 | : "r"((intptr_t)(src_stride_bgra)) // %4 |
michael@0 | 1627 | : "memory", "cc" |
michael@0 | 1628 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1629 | , "r14" |
michael@0 | 1630 | #endif |
michael@0 | 1631 | #if defined(__SSE2__) |
michael@0 | 1632 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1633 | #endif |
michael@0 | 1634 | ); |
michael@0 | 1635 | } |
michael@0 | 1636 | |
michael@0 | 1637 | void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { |
michael@0 | 1638 | asm volatile ( |
michael@0 | 1639 | "movdqa %4,%%xmm5 \n" |
michael@0 | 1640 | "movdqa %3,%%xmm4 \n" |
michael@0 | 1641 | LABELALIGN |
michael@0 | 1642 | "1: \n" |
michael@0 | 1643 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1644 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1645 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1646 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 1647 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1648 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1649 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1650 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 1651 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1652 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1653 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 1654 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1655 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 1656 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 1657 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1658 | "sub $0x10,%2 \n" |
michael@0 | 1659 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1660 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1661 | "jg 1b \n" |
michael@0 | 1662 | : "+r"(src_abgr), // %0 |
michael@0 | 1663 | "+r"(dst_y), // %1 |
michael@0 | 1664 | "+r"(pix) // %2 |
michael@0 | 1665 | : "m"(kABGRToY), // %3 |
michael@0 | 1666 | "m"(kAddY16) // %4 |
michael@0 | 1667 | : "memory", "cc" |
michael@0 | 1668 | #if defined(__SSE2__) |
michael@0 | 1669 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 1670 | #endif |
michael@0 | 1671 | ); |
michael@0 | 1672 | } |
michael@0 | 1673 | |
michael@0 | 1674 | void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { |
michael@0 | 1675 | asm volatile ( |
michael@0 | 1676 | "movdqa %4,%%xmm5 \n" |
michael@0 | 1677 | "movdqa %3,%%xmm4 \n" |
michael@0 | 1678 | LABELALIGN |
michael@0 | 1679 | "1: \n" |
michael@0 | 1680 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1681 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1682 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1683 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 1684 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1685 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1686 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1687 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 1688 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1689 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1690 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 1691 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1692 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 1693 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 1694 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1695 | "sub $0x10,%2 \n" |
michael@0 | 1696 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1697 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1698 | "jg 1b \n" |
michael@0 | 1699 | : "+r"(src_abgr), // %0 |
michael@0 | 1700 | "+r"(dst_y), // %1 |
michael@0 | 1701 | "+r"(pix) // %2 |
michael@0 | 1702 | : "m"(kABGRToY), // %3 |
michael@0 | 1703 | "m"(kAddY16) // %4 |
michael@0 | 1704 | : "memory", "cc" |
michael@0 | 1705 | #if defined(__SSE2__) |
michael@0 | 1706 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 1707 | #endif |
michael@0 | 1708 | ); |
michael@0 | 1709 | } |
michael@0 | 1710 | |
michael@0 | 1711 | void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { |
michael@0 | 1712 | asm volatile ( |
michael@0 | 1713 | "movdqa %4,%%xmm5 \n" |
michael@0 | 1714 | "movdqa %3,%%xmm4 \n" |
michael@0 | 1715 | LABELALIGN |
michael@0 | 1716 | "1: \n" |
michael@0 | 1717 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1718 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1719 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1720 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 1721 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1722 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1723 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1724 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 1725 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1726 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1727 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 1728 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1729 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 1730 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 1731 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1732 | "sub $0x10,%2 \n" |
michael@0 | 1733 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1734 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1735 | "jg 1b \n" |
michael@0 | 1736 | : "+r"(src_rgba), // %0 |
michael@0 | 1737 | "+r"(dst_y), // %1 |
michael@0 | 1738 | "+r"(pix) // %2 |
michael@0 | 1739 | : "m"(kRGBAToY), // %3 |
michael@0 | 1740 | "m"(kAddY16) // %4 |
michael@0 | 1741 | : "memory", "cc" |
michael@0 | 1742 | #if defined(__SSE2__) |
michael@0 | 1743 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 1744 | #endif |
michael@0 | 1745 | ); |
michael@0 | 1746 | } |
michael@0 | 1747 | |
michael@0 | 1748 | void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) { |
michael@0 | 1749 | asm volatile ( |
michael@0 | 1750 | "movdqa %4,%%xmm5 \n" |
michael@0 | 1751 | "movdqa %3,%%xmm4 \n" |
michael@0 | 1752 | LABELALIGN |
michael@0 | 1753 | "1: \n" |
michael@0 | 1754 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1755 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1756 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1757 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 1758 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1759 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 1760 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1761 | "pmaddubsw %%xmm4,%%xmm3 \n" |
michael@0 | 1762 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1763 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 1764 | "phaddw %%xmm3,%%xmm2 \n" |
michael@0 | 1765 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1766 | "psrlw $0x7,%%xmm2 \n" |
michael@0 | 1767 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 1768 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1769 | "sub $0x10,%2 \n" |
michael@0 | 1770 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1771 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1772 | "jg 1b \n" |
michael@0 | 1773 | : "+r"(src_rgba), // %0 |
michael@0 | 1774 | "+r"(dst_y), // %1 |
michael@0 | 1775 | "+r"(pix) // %2 |
michael@0 | 1776 | : "m"(kRGBAToY), // %3 |
michael@0 | 1777 | "m"(kAddY16) // %4 |
michael@0 | 1778 | : "memory", "cc" |
michael@0 | 1779 | #if defined(__SSE2__) |
michael@0 | 1780 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 1781 | #endif |
michael@0 | 1782 | ); |
michael@0 | 1783 | } |
michael@0 | 1784 | |
michael@0 | 1785 | void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr, |
michael@0 | 1786 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1787 | asm volatile ( |
michael@0 | 1788 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1789 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1790 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1791 | : |
michael@0 | 1792 | : "m"(kABGRToU), // %0 |
michael@0 | 1793 | "m"(kABGRToV), // %1 |
michael@0 | 1794 | "m"(kAddUV128) // %2 |
michael@0 | 1795 | ); |
michael@0 | 1796 | asm volatile ( |
michael@0 | 1797 | "sub %1,%2 \n" |
michael@0 | 1798 | LABELALIGN |
michael@0 | 1799 | "1: \n" |
michael@0 | 1800 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1801 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1802 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1803 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1804 | BUNDLEALIGN |
michael@0 | 1805 | MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 |
michael@0 | 1806 | MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 |
michael@0 | 1807 | MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 |
michael@0 | 1808 | MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 |
michael@0 | 1809 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1810 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1811 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1812 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1813 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1814 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1815 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1816 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1817 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1818 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1819 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1820 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1821 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1822 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1823 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1824 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1825 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1826 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1827 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1828 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1829 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1830 | "sub $0x10,%3 \n" |
michael@0 | 1831 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1832 | BUNDLEALIGN |
michael@0 | 1833 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1834 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1835 | "jg 1b \n" |
michael@0 | 1836 | : "+r"(src_abgr0), // %0 |
michael@0 | 1837 | "+r"(dst_u), // %1 |
michael@0 | 1838 | "+r"(dst_v), // %2 |
michael@0 | 1839 | "+rm"(width) // %3 |
michael@0 | 1840 | : "r"((intptr_t)(src_stride_abgr)) // %4 |
michael@0 | 1841 | : "memory", "cc" |
michael@0 | 1842 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1843 | , "r14" |
michael@0 | 1844 | #endif |
michael@0 | 1845 | #if defined(__SSE2__) |
michael@0 | 1846 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1847 | #endif |
michael@0 | 1848 | ); |
michael@0 | 1849 | } |
michael@0 | 1850 | |
michael@0 | 1851 | void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, |
michael@0 | 1852 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1853 | asm volatile ( |
michael@0 | 1854 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1855 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1856 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1857 | : |
michael@0 | 1858 | : "m"(kABGRToU), // %0 |
michael@0 | 1859 | "m"(kABGRToV), // %1 |
michael@0 | 1860 | "m"(kAddUV128) // %2 |
michael@0 | 1861 | ); |
michael@0 | 1862 | asm volatile ( |
michael@0 | 1863 | "sub %1,%2 \n" |
michael@0 | 1864 | LABELALIGN |
michael@0 | 1865 | "1: \n" |
michael@0 | 1866 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1867 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1868 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1869 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1870 | BUNDLEALIGN |
michael@0 | 1871 | MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
michael@0 | 1872 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1873 | MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 |
michael@0 | 1874 | "pavgb %%xmm7,%%xmm1 \n" |
michael@0 | 1875 | MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 |
michael@0 | 1876 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1877 | MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 |
michael@0 | 1878 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 1879 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1880 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1881 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1882 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1883 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1884 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1885 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1886 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1887 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1888 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1889 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1890 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1891 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1892 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1893 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1894 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1895 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1896 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1897 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1898 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1899 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1900 | "sub $0x10,%3 \n" |
michael@0 | 1901 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1902 | BUNDLEALIGN |
michael@0 | 1903 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1904 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1905 | "jg 1b \n" |
michael@0 | 1906 | : "+r"(src_abgr0), // %0 |
michael@0 | 1907 | "+r"(dst_u), // %1 |
michael@0 | 1908 | "+r"(dst_v), // %2 |
michael@0 | 1909 | "+rm"(width) // %3 |
michael@0 | 1910 | : "r"((intptr_t)(src_stride_abgr)) // %4 |
michael@0 | 1911 | : "memory", "cc" |
michael@0 | 1912 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1913 | , "r14" |
michael@0 | 1914 | #endif |
michael@0 | 1915 | #if defined(__SSE2__) |
michael@0 | 1916 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1917 | #endif |
michael@0 | 1918 | ); |
michael@0 | 1919 | } |
michael@0 | 1920 | |
michael@0 | 1921 | void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
michael@0 | 1922 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1923 | asm volatile ( |
michael@0 | 1924 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1925 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1926 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1927 | : |
michael@0 | 1928 | : "m"(kRGBAToU), // %0 |
michael@0 | 1929 | "m"(kRGBAToV), // %1 |
michael@0 | 1930 | "m"(kAddUV128) // %2 |
michael@0 | 1931 | ); |
michael@0 | 1932 | asm volatile ( |
michael@0 | 1933 | "sub %1,%2 \n" |
michael@0 | 1934 | LABELALIGN |
michael@0 | 1935 | "1: \n" |
michael@0 | 1936 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1937 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 1938 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 1939 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 1940 | BUNDLEALIGN |
michael@0 | 1941 | MEMOPREG(pavgb,0x00,0,4,1,xmm0) // pavgb (%0,%4,1),%%xmm0 |
michael@0 | 1942 | MEMOPREG(pavgb,0x10,0,4,1,xmm1) // pavgb 0x10(%0,%4,1),%%xmm1 |
michael@0 | 1943 | MEMOPREG(pavgb,0x20,0,4,1,xmm2) // pavgb 0x20(%0,%4,1),%%xmm2 |
michael@0 | 1944 | MEMOPREG(pavgb,0x30,0,4,1,xmm6) // pavgb 0x30(%0,%4,1),%%xmm6 |
michael@0 | 1945 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 1946 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 1947 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1948 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 1949 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 1950 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 1951 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 1952 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 1953 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 1954 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1955 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 1956 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 1957 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 1958 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 1959 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 1960 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 1961 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 1962 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 1963 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 1964 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 1965 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 1966 | "sub $0x10,%3 \n" |
michael@0 | 1967 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 1968 | BUNDLEALIGN |
michael@0 | 1969 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 1970 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 1971 | "jg 1b \n" |
michael@0 | 1972 | : "+r"(src_rgba0), // %0 |
michael@0 | 1973 | "+r"(dst_u), // %1 |
michael@0 | 1974 | "+r"(dst_v), // %2 |
michael@0 | 1975 | "+rm"(width) // %3 |
michael@0 | 1976 | : "r"((intptr_t)(src_stride_rgba)) // %4 |
michael@0 | 1977 | : "memory", "cc" |
michael@0 | 1978 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1979 | , "r14" |
michael@0 | 1980 | #endif |
michael@0 | 1981 | #if defined(__SSE2__) |
michael@0 | 1982 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 1983 | #endif |
michael@0 | 1984 | ); |
michael@0 | 1985 | } |
michael@0 | 1986 | |
michael@0 | 1987 | void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba, |
michael@0 | 1988 | uint8* dst_u, uint8* dst_v, int width) { |
michael@0 | 1989 | asm volatile ( |
michael@0 | 1990 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1991 | "movdqa %1,%%xmm3 \n" |
michael@0 | 1992 | "movdqa %2,%%xmm5 \n" |
michael@0 | 1993 | : |
michael@0 | 1994 | : "m"(kRGBAToU), // %0 |
michael@0 | 1995 | "m"(kRGBAToV), // %1 |
michael@0 | 1996 | "m"(kAddUV128) // %2 |
michael@0 | 1997 | ); |
michael@0 | 1998 | asm volatile ( |
michael@0 | 1999 | "sub %1,%2 \n" |
michael@0 | 2000 | LABELALIGN |
michael@0 | 2001 | "1: \n" |
michael@0 | 2002 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 2003 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 2004 | "movdqu " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 2005 | "movdqu " MEMACCESS2(0x30,0) ",%%xmm6 \n" |
michael@0 | 2006 | BUNDLEALIGN |
michael@0 | 2007 | MEMOPREG(movdqu,0x00,0,4,1,xmm7) // movdqu (%0,%4,1),%%xmm7 |
michael@0 | 2008 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 2009 | MEMOPREG(movdqu,0x10,0,4,1,xmm7) // movdqu 0x10(%0,%4,1),%%xmm7 |
michael@0 | 2010 | "pavgb %%xmm7,%%xmm1 \n" |
michael@0 | 2011 | MEMOPREG(movdqu,0x20,0,4,1,xmm7) // movdqu 0x20(%0,%4,1),%%xmm7 |
michael@0 | 2012 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 2013 | MEMOPREG(movdqu,0x30,0,4,1,xmm7) // movdqu 0x30(%0,%4,1),%%xmm7 |
michael@0 | 2014 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 2015 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 2016 | "movdqa %%xmm0,%%xmm7 \n" |
michael@0 | 2017 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 2018 | "shufps $0xdd,%%xmm1,%%xmm7 \n" |
michael@0 | 2019 | "pavgb %%xmm7,%%xmm0 \n" |
michael@0 | 2020 | "movdqa %%xmm2,%%xmm7 \n" |
michael@0 | 2021 | "shufps $0x88,%%xmm6,%%xmm2 \n" |
michael@0 | 2022 | "shufps $0xdd,%%xmm6,%%xmm7 \n" |
michael@0 | 2023 | "pavgb %%xmm7,%%xmm2 \n" |
michael@0 | 2024 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2025 | "movdqa %%xmm2,%%xmm6 \n" |
michael@0 | 2026 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 2027 | "pmaddubsw %%xmm4,%%xmm2 \n" |
michael@0 | 2028 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 2029 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 2030 | "phaddw %%xmm2,%%xmm0 \n" |
michael@0 | 2031 | "phaddw %%xmm6,%%xmm1 \n" |
michael@0 | 2032 | "psraw $0x8,%%xmm0 \n" |
michael@0 | 2033 | "psraw $0x8,%%xmm1 \n" |
michael@0 | 2034 | "packsswb %%xmm1,%%xmm0 \n" |
michael@0 | 2035 | "paddb %%xmm5,%%xmm0 \n" |
michael@0 | 2036 | "sub $0x10,%3 \n" |
michael@0 | 2037 | "movlps %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 2038 | BUNDLEALIGN |
michael@0 | 2039 | MEMOPMEM(movhps,xmm0,0x00,1,2,1) // movhps %%xmm0,(%1,%2,1) |
michael@0 | 2040 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 2041 | "jg 1b \n" |
michael@0 | 2042 | : "+r"(src_rgba0), // %0 |
michael@0 | 2043 | "+r"(dst_u), // %1 |
michael@0 | 2044 | "+r"(dst_v), // %2 |
michael@0 | 2045 | "+rm"(width) // %3 |
michael@0 | 2046 | : "r"((intptr_t)(src_stride_rgba)) // %4 |
michael@0 | 2047 | : "memory", "cc" |
michael@0 | 2048 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2049 | , "r14" |
michael@0 | 2050 | #endif |
michael@0 | 2051 | #if defined(__SSE2__) |
michael@0 | 2052 | , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
michael@0 | 2053 | #endif |
michael@0 | 2054 | ); |
michael@0 | 2055 | } |
michael@0 | 2056 | #endif // HAS_ARGBTOUVROW_SSSE3 |
michael@0 | 2057 | |
michael@0 | 2058 | #ifdef HAS_I422TOARGBROW_SSSE3 |
michael@0 | 2059 | #define UB 127 /* 2.018 * 64 = 129; clamped to int8 max 127 */ |
michael@0 | 2060 | #define UG -25 /* (int8)(-0.391 * 64 - 0.5) */ |
michael@0 | 2061 | #define UR 0 |
michael@0 | 2062 | |
michael@0 | 2063 | #define VB 0 |
michael@0 | 2064 | #define VG -52 /* (int8)(-0.813 * 64 - 0.5) */ |
michael@0 | 2065 | #define VR 102 /* (int8)(1.596 * 64 + 0.5) */ |
michael@0 | 2066 | |
michael@0 | 2067 | // Bias |
michael@0 | 2068 | #define BB (UB * 128 + VB * 128) |
michael@0 | 2069 | #define BG (UG * 128 + VG * 128) |
michael@0 | 2070 | #define BR (UR * 128 + VR * 128) |
michael@0 | 2071 | |
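// Worked out with the values above: BB = 127*128 + 0*128 = 16256,
// BG = -25*128 + -52*128 = -9856, BR = 0*128 + 102*128 = 13056.  Subtracting
// these after pmaddubsw turns UB*U + VB*V into UB*(U-128) + VB*(V-128),
// recentering the unsigned chroma samples around zero.
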
michael@0 | 2072 | #define YG 74 /* (int8)(1.164 * 64 + 0.5) */ |
michael@0 | 2073 | |
michael@0 | 2074 | struct { |
michael@0 | 2075 | vec8 kUVToB; // 0 |
michael@0 | 2076 | vec8 kUVToG; // 16 |
michael@0 | 2077 | vec8 kUVToR; // 32 |
michael@0 | 2078 | vec16 kUVBiasB; // 48 |
michael@0 | 2079 | vec16 kUVBiasG; // 64 |
michael@0 | 2080 | vec16 kUVBiasR; // 80 |
michael@0 | 2081 | vec16 kYSub16; // 96 |
michael@0 | 2082 | vec16 kYToRgb; // 112 |
michael@0 | 2083 | vec8 kVUToB; // 128 |
michael@0 | 2084 | vec8 kVUToG; // 144 |
michael@0 | 2085 | vec8 kVUToR; // 160 |
michael@0 | 2086 | } static SIMD_ALIGNED(kYuvConstants) = { |
michael@0 | 2087 | { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, |
michael@0 | 2088 | { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, |
michael@0 | 2089 | { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR }, |
michael@0 | 2090 | { BB, BB, BB, BB, BB, BB, BB, BB }, |
michael@0 | 2091 | { BG, BG, BG, BG, BG, BG, BG, BG }, |
michael@0 | 2092 | { BR, BR, BR, BR, BR, BR, BR, BR }, |
michael@0 | 2093 | { 16, 16, 16, 16, 16, 16, 16, 16 }, |
michael@0 | 2094 | { YG, YG, YG, YG, YG, YG, YG, YG }, |
michael@0 | 2095 | { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, |
michael@0 | 2096 | { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, |
michael@0 | 2097 | { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } |
michael@0 | 2098 | }; |
michael@0 | 2099 | |
michael@0 | 2101 | // Read 8 UV from 444 |
michael@0 | 2102 | #define READYUV444 \ |
michael@0 | 2103 | "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
michael@0 | 2104 | BUNDLEALIGN \ |
michael@0 | 2105 | MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
michael@0 | 2106 | "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
michael@0 | 2107 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2108 | |
michael@0 | 2109 | // Read 4 UV from 422, upsample to 8 UV |
michael@0 | 2110 | #define READYUV422 \ |
michael@0 | 2111 | "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
michael@0 | 2112 | BUNDLEALIGN \ |
michael@0 | 2113 | MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
michael@0 | 2114 | "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
michael@0 | 2115 | "punpcklbw %%xmm1,%%xmm0 \n" \ |
michael@0 | 2116 | "punpcklwd %%xmm0,%%xmm0 \n" |
michael@0 | 2117 | |
michael@0 | 2118 | // Read 2 UV from 411, upsample to 8 UV |
michael@0 | 2119 | #define READYUV411 \ |
michael@0 | 2120 | "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
michael@0 | 2121 | BUNDLEALIGN \ |
michael@0 | 2122 | MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
michael@0 | 2123 | "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ |
michael@0 | 2124 | "punpcklbw %%xmm1,%%xmm0 \n" \ |
michael@0 | 2125 | "punpcklwd %%xmm0,%%xmm0 \n" \ |
michael@0 | 2126 | "punpckldq %%xmm0,%%xmm0 \n" |
michael@0 | 2127 | |
michael@0 | 2128 | // Read 4 UV from NV12, upsample to 8 UV |
michael@0 | 2129 | #define READNV12 \ |
michael@0 | 2130 | "movq " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
michael@0 | 2131 | "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ |
michael@0 | 2132 | "punpcklwd %%xmm0,%%xmm0 \n" |
michael@0 | 2133 | |
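// How the punpck sequences above replicate chroma (shown for 422):
//
//   U bytes:         u0 u1 u2 u3
//   V bytes:         v0 v1 v2 v3
//   punpcklbw ->     u0 v0 u1 v1 u2 v2 u3 v3
//   punpcklwd ->     u0 v0 u0 v0 u1 v1 u1 v1 ...   (8 UV pairs for 8 Y)
//
// 411 adds punpckldq so each UV pair covers four Y samples; 444 needs no
// replication; NV12 is already interleaved, so only punpcklwd is needed.
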
michael@0 | 2134 | // Convert 8 pixels: 8 UV and 8 Y |
michael@0 | 2135 | #define YUVTORGB \ |
michael@0 | 2136 | "movdqa %%xmm0,%%xmm1 \n" \ |
michael@0 | 2137 | "movdqa %%xmm0,%%xmm2 \n" \ |
michael@0 | 2138 | "pmaddubsw " MEMACCESS([kYuvConstants]) ",%%xmm0 \n" \ |
michael@0 | 2139 | "pmaddubsw " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1 \n" \ |
michael@0 | 2140 | "pmaddubsw " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2 \n" \ |
michael@0 | 2141 | "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ |
michael@0 | 2142 | "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ |
michael@0 | 2143 | "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ |
michael@0 | 2144 | "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
michael@0 | 2145 | "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
michael@0 | 2146 | "punpcklbw %%xmm4,%%xmm3 \n" \ |
michael@0 | 2147 | "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ |
michael@0 | 2148 | "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ |
michael@0 | 2149 | "paddsw %%xmm3,%%xmm0 \n" \ |
michael@0 | 2150 | "paddsw %%xmm3,%%xmm1 \n" \ |
michael@0 | 2151 | "paddsw %%xmm3,%%xmm2 \n" \ |
michael@0 | 2152 | "psraw $0x6,%%xmm0 \n" \ |
michael@0 | 2153 | "psraw $0x6,%%xmm1 \n" \ |
michael@0 | 2154 | "psraw $0x6,%%xmm2 \n" \ |
michael@0 | 2155 | "packuswb %%xmm0,%%xmm0 \n" \ |
michael@0 | 2156 | "packuswb %%xmm1,%%xmm1 \n" \ |
michael@0 | 2157 | "packuswb %%xmm2,%%xmm2 \n" |
michael@0 | 2158 | |
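// Scalar sketch of YUVTORGB's fixed-point math: 6-bit coefficients, psraw $6,
// then unsigned saturation (packuswb).  Names below are illustrative only.
static __inline uint8 Clamp255(int v) {
  return (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
static __inline void YuvPixelSketch(int y, int u, int v,
                                    uint8* b, uint8* g, uint8* r) {
  int y1 = (y - 16) * 74;  // kYSub16, kYToRgb
  *b = Clamp255((y1 + 127 * (u - 128)) >> 6);                  // UB
  *g = Clamp255((y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6);  // UG, VG
  *r = Clamp255((y1 + 102 * (v - 128)) >> 6);                  // VR
}
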
michael@0 | 2159 | // Convert 8 pixels: 8 VU and 8 Y |
michael@0 | 2160 | #define YVUTORGB \ |
michael@0 | 2161 | "movdqa %%xmm0,%%xmm1 \n" \ |
michael@0 | 2162 | "movdqa %%xmm0,%%xmm2 \n" \ |
michael@0 | 2163 | "pmaddubsw " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0 \n" \ |
michael@0 | 2164 | "pmaddubsw " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1 \n" \ |
michael@0 | 2165 | "pmaddubsw " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2 \n" \ |
michael@0 | 2166 | "psubw " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0 \n" \ |
michael@0 | 2167 | "psubw " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1 \n" \ |
michael@0 | 2168 | "psubw " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2 \n" \ |
michael@0 | 2169 | "movq " MEMACCESS([y_buf]) ",%%xmm3 \n" \ |
michael@0 | 2170 | "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" \ |
michael@0 | 2171 | "punpcklbw %%xmm4,%%xmm3 \n" \ |
michael@0 | 2172 | "psubsw " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3 \n" \ |
michael@0 | 2173 | "pmullw " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3 \n" \ |
michael@0 | 2174 | "paddsw %%xmm3,%%xmm0 \n" \ |
michael@0 | 2175 | "paddsw %%xmm3,%%xmm1 \n" \ |
michael@0 | 2176 | "paddsw %%xmm3,%%xmm2 \n" \ |
michael@0 | 2177 | "psraw $0x6,%%xmm0 \n" \ |
michael@0 | 2178 | "psraw $0x6,%%xmm1 \n" \ |
michael@0 | 2179 | "psraw $0x6,%%xmm2 \n" \ |
michael@0 | 2180 | "packuswb %%xmm0,%%xmm0 \n" \ |
michael@0 | 2181 | "packuswb %%xmm1,%%xmm1 \n" \ |
michael@0 | 2182 | "packuswb %%xmm2,%%xmm2 \n" |
michael@0 | 2183 | |
michael@0 | 2184 | void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2185 | const uint8* u_buf, |
michael@0 | 2186 | const uint8* v_buf, |
michael@0 | 2187 | uint8* dst_argb, |
michael@0 | 2188 | int width) { |
michael@0 | 2189 | asm volatile ( |
michael@0 | 2190 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2191 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2192 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2193 | LABELALIGN |
michael@0 | 2194 | "1: \n" |
michael@0 | 2195 | READYUV444 |
michael@0 | 2196 | YUVTORGB |
michael@0 | 2197 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2198 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2199 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2200 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2201 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2202 | "movdqa %%xmm0," MEMACCESS([dst_argb]) " \n" |
michael@0 | 2203 | "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" |
michael@0 | 2204 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2205 | "sub $0x8,%[width] \n" |
michael@0 | 2206 | "jg 1b \n" |
michael@0 | 2207 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2208 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2209 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2210 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2211 | [width]"+rm"(width) // %[width] |
michael@0 | 2212 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2213 | : "memory", "cc" |
michael@0 | 2214 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2215 | , "r14" |
michael@0 | 2216 | #endif |
michael@0 | 2217 | #if defined(__SSE2__) |
michael@0 | 2218 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2219 | #endif |
michael@0 | 2220 | ); |
michael@0 | 2221 | } |
michael@0 | 2222 | |
michael@0 | 2223 | void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, |
michael@0 | 2224 | const uint8* u_buf, |
michael@0 | 2225 | const uint8* v_buf, |
michael@0 | 2226 | uint8* dst_rgb24, |
michael@0 | 2227 | int width) { |
michael@0 | 2228 | // With -fPIC, 32 bit gcc 4.2 on OS X runs out of GPR regs; load the shuffle masks in a separate asm block. |
michael@0 | 2229 | #if defined(__i386__) |
michael@0 | 2230 | asm volatile ( |
michael@0 | 2231 | "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
michael@0 | 2232 | "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
michael@0 | 2233 | :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), |
michael@0 | 2234 | [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)); |
michael@0 | 2235 | #endif |
michael@0 | 2236 | |
michael@0 | 2237 | asm volatile ( |
michael@0 | 2238 | #if !defined(__i386__) |
michael@0 | 2239 | "movdqa %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n" |
michael@0 | 2240 | "movdqa %[kShuffleMaskARGBToRGB24],%%xmm6 \n" |
michael@0 | 2241 | #endif |
michael@0 | 2242 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2243 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2244 | LABELALIGN |
michael@0 | 2245 | "1: \n" |
michael@0 | 2246 | READYUV422 |
michael@0 | 2247 | YUVTORGB |
michael@0 | 2248 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2249 | "punpcklbw %%xmm2,%%xmm2 \n" |
michael@0 | 2250 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2251 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2252 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2253 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 2254 | "pshufb %%xmm6,%%xmm1 \n" |
michael@0 | 2255 | "palignr $0xc,%%xmm0,%%xmm1 \n" |
michael@0 | 2256 | "movq %%xmm0," MEMACCESS([dst_rgb24]) "\n" |
michael@0 | 2257 | "movdqu %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n" |
michael@0 | 2258 | "lea " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n" |
michael@0 | 2259 | "sub $0x8,%[width] \n" |
michael@0 | 2260 | "jg 1b \n" |
michael@0 | 2261 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2262 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2263 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2264 | [dst_rgb24]"+r"(dst_rgb24), // %[dst_rgb24] |
michael@0 | 2265 | [width]"+rm"(width) // %[width] |
michael@0 | 2266 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) |
michael@0 | 2267 | #if !defined(__i386__) |
michael@0 | 2268 | , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), |
michael@0 | 2269 | [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) |
michael@0 | 2270 | #endif |
michael@0 | 2271 | : "memory", "cc" |
michael@0 | 2272 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2273 | , "r14" |
michael@0 | 2274 | #endif |
michael@0 | 2275 | #if defined(__SSE2__) |
michael@0 | 2276 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 2277 | #endif |
michael@0 | 2278 | ); |
michael@0 | 2279 | } |
michael@0 | 2280 | |
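// Sketch of the RGB24 packing at the end of the loop above, assuming shuffle
// masks of the usual form (every fourth, alpha, byte dropped): each pshufb
// compacts 16 ARGB bytes to 12 RGB bytes, palignr $0xc splices the two
// 12-byte runs across xmm0/xmm1, and the movq + movdqu stores then write
// 24 contiguous bytes, i.e. 8 RGB24 pixels.
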
michael@0 | 2281 | void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, |
michael@0 | 2282 | const uint8* u_buf, |
michael@0 | 2283 | const uint8* v_buf, |
michael@0 | 2284 | uint8* dst_raw, |
michael@0 | 2285 | int width) { |
michael@0 | 2286 | // With -fPIC, 32 bit gcc 4.2 on OS X runs out of GPR regs; load the shuffle masks in a separate asm block. |
michael@0 | 2287 | #if defined(__i386__) |
michael@0 | 2288 | asm volatile ( |
michael@0 | 2289 | "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" |
michael@0 | 2290 | "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" |
michael@0 | 2291 | :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), |
michael@0 | 2292 | [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)); |
michael@0 | 2293 | #endif |
michael@0 | 2294 | |
michael@0 | 2295 | asm volatile ( |
michael@0 | 2296 | #if !defined(__i386__) |
michael@0 | 2297 | "movdqa %[kShuffleMaskARGBToRAW_0],%%xmm5 \n" |
michael@0 | 2298 | "movdqa %[kShuffleMaskARGBToRAW],%%xmm6 \n" |
michael@0 | 2299 | #endif |
michael@0 | 2300 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2301 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2302 | LABELALIGN |
michael@0 | 2303 | "1: \n" |
michael@0 | 2304 | READYUV422 |
michael@0 | 2305 | YUVTORGB |
michael@0 | 2306 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2307 | "punpcklbw %%xmm2,%%xmm2 \n" |
michael@0 | 2308 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2309 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2310 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2311 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 2312 | "pshufb %%xmm6,%%xmm1 \n" |
michael@0 | 2313 | "palignr $0xc,%%xmm0,%%xmm1 \n" |
michael@0 | 2314 | "movq %%xmm0," MEMACCESS([dst_raw]) " \n" |
michael@0 | 2315 | "movdqu %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n" |
michael@0 | 2316 | "lea " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n" |
michael@0 | 2317 | "sub $0x8,%[width] \n" |
michael@0 | 2318 | "jg 1b \n" |
michael@0 | 2319 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2320 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2321 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2322 | [dst_raw]"+r"(dst_raw), // %[dst_raw] |
michael@0 | 2323 | [width]"+rm"(width) // %[width] |
michael@0 | 2324 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) |
michael@0 | 2325 | #if !defined(__i386__) |
michael@0 | 2326 | , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), |
michael@0 | 2327 | [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) |
michael@0 | 2328 | #endif |
michael@0 | 2329 | : "memory", "cc" |
michael@0 | 2330 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2331 | , "r14" |
michael@0 | 2332 | #endif |
michael@0 | 2333 | #if defined(__SSE2__) |
michael@0 | 2334 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 2335 | #endif |
michael@0 | 2336 | ); |
michael@0 | 2337 | } |
michael@0 | 2338 | |
michael@0 | 2339 | void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2340 | const uint8* u_buf, |
michael@0 | 2341 | const uint8* v_buf, |
michael@0 | 2342 | uint8* dst_argb, |
michael@0 | 2343 | int width) { |
michael@0 | 2344 | asm volatile ( |
michael@0 | 2345 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2346 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2347 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2348 | LABELALIGN |
michael@0 | 2349 | "1: \n" |
michael@0 | 2350 | READYUV422 |
michael@0 | 2351 | YUVTORGB |
michael@0 | 2352 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2353 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2354 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2355 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2356 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2357 | "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2358 | "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2359 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2360 | "sub $0x8,%[width] \n" |
michael@0 | 2361 | "jg 1b \n" |
michael@0 | 2362 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2363 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2364 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2365 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2366 | [width]"+rm"(width) // %[width] |
michael@0 | 2367 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2368 | : "memory", "cc" |
michael@0 | 2369 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2370 | , "r14" |
michael@0 | 2371 | #endif |
michael@0 | 2372 | #if defined(__SSE2__) |
michael@0 | 2373 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2374 | #endif |
michael@0 | 2375 | ); |
michael@0 | 2376 | } |
michael@0 | 2377 | |
michael@0 | 2378 | void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2379 | const uint8* u_buf, |
michael@0 | 2380 | const uint8* v_buf, |
michael@0 | 2381 | uint8* dst_argb, |
michael@0 | 2382 | int width) { |
michael@0 | 2383 | asm volatile ( |
michael@0 | 2384 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2385 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2386 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2387 | LABELALIGN |
michael@0 | 2388 | "1: \n" |
michael@0 | 2389 | READYUV411 |
michael@0 | 2390 | YUVTORGB |
michael@0 | 2391 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2392 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2393 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2394 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2395 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2396 | "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2397 | "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2398 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2399 | "sub $0x8,%[width] \n" |
michael@0 | 2400 | "jg 1b \n" |
michael@0 | 2401 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2402 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2403 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2404 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2405 | [width]"+rm"(width) // %[width] |
michael@0 | 2406 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2407 | : "memory", "cc" |
michael@0 | 2408 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2409 | , "r14" |
michael@0 | 2410 | #endif |
michael@0 | 2411 | #if defined(__SSE2__) |
michael@0 | 2412 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2413 | #endif |
michael@0 | 2414 | ); |
michael@0 | 2415 | } |
michael@0 | 2416 | |
michael@0 | 2417 | void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2418 | const uint8* uv_buf, |
michael@0 | 2419 | uint8* dst_argb, |
michael@0 | 2420 | int width) { |
michael@0 | 2421 | asm volatile ( |
michael@0 | 2422 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2423 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2424 | LABELALIGN |
michael@0 | 2425 | "1: \n" |
michael@0 | 2426 | READNV12 |
michael@0 | 2427 | YUVTORGB |
michael@0 | 2428 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2429 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2430 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2431 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2432 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2433 | "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2434 | "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2435 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2436 | "sub $0x8,%[width] \n" |
michael@0 | 2437 | "jg 1b \n" |
michael@0 | 2438 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2439 | [uv_buf]"+r"(uv_buf), // %[uv_buf] |
michael@0 | 2440 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2441 | [width]"+rm"(width) // %[width] |
michael@0 | 2442 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2443 | : "memory", "cc" |
michael@0 | 2444 | // Does not use r14. |
michael@0 | 2445 | #if defined(__SSE2__) |
michael@0 | 2446 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2447 | #endif |
michael@0 | 2448 | ); |
michael@0 | 2449 | } |
michael@0 | 2450 | |
michael@0 | 2451 | void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, |
michael@0 | 2452 | const uint8* uv_buf, |
michael@0 | 2453 | uint8* dst_argb, |
michael@0 | 2454 | int width) { |
michael@0 | 2455 | asm volatile ( |
michael@0 | 2456 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2457 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2458 | LABELALIGN |
michael@0 | 2459 | "1: \n" |
michael@0 | 2460 | READNV12 |
michael@0 | 2461 |     YVUTORGB  // NV21 reuses the NV12 loader; YVUTORGB applies the chroma constants with U and V swapped for the VU byte order |
michael@0 | 2462 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2463 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2464 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2465 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2466 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2467 | "movdqa %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2468 | "movdqa %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2469 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2470 | "sub $0x8,%[width] \n" |
michael@0 | 2471 | "jg 1b \n" |
michael@0 | 2472 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2473 | [uv_buf]"+r"(uv_buf), // %[uv_buf] |
michael@0 | 2474 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2475 | [width]"+rm"(width) // %[width] |
michael@0 | 2476 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2477 | : "memory", "cc" |
michael@0 | 2478 | // Does not use r14. |
michael@0 | 2479 | #if defined(__SSE2__) |
michael@0 | 2480 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2481 | #endif |
michael@0 | 2482 | ); |
michael@0 | 2483 | } |
michael@0 | 2484 | |
michael@0 | 2485 | void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2486 | const uint8* u_buf, |
michael@0 | 2487 | const uint8* v_buf, |
michael@0 | 2488 | uint8* dst_argb, |
michael@0 | 2489 | int width) { |
michael@0 | 2490 | asm volatile ( |
michael@0 | 2491 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2492 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2493 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2494 | LABELALIGN |
michael@0 | 2495 | "1: \n" |
michael@0 | 2496 | READYUV444 |
michael@0 | 2497 | YUVTORGB |
michael@0 | 2498 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2499 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2500 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2501 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2502 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2503 | "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2504 | "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2505 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2506 | "sub $0x8,%[width] \n" |
michael@0 | 2507 | "jg 1b \n" |
michael@0 | 2508 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2509 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2510 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2511 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2512 | [width]"+rm"(width) // %[width] |
michael@0 | 2513 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2514 | : "memory", "cc" |
michael@0 | 2515 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2516 | , "r14" |
michael@0 | 2517 | #endif |
michael@0 | 2518 | #if defined(__SSE2__) |
michael@0 | 2519 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2520 | #endif |
michael@0 | 2521 | ); |
michael@0 | 2522 | } |
michael@0 | 2523 | |
michael@0 | 2524 | void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2525 | const uint8* u_buf, |
michael@0 | 2526 | const uint8* v_buf, |
michael@0 | 2527 | uint8* dst_argb, |
michael@0 | 2528 | int width) { |
michael@0 | 2529 | asm volatile ( |
michael@0 | 2530 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2531 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2532 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2533 | LABELALIGN |
michael@0 | 2534 | "1: \n" |
michael@0 | 2535 | READYUV422 |
michael@0 | 2536 | YUVTORGB |
michael@0 | 2537 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2538 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2539 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2540 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2541 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2542 | "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2543 | "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2544 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2545 | "sub $0x8,%[width] \n" |
michael@0 | 2546 | "jg 1b \n" |
michael@0 | 2547 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2548 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2549 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2550 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2551 | [width]"+rm"(width) // %[width] |
michael@0 | 2552 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2553 | : "memory", "cc" |
michael@0 | 2554 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2555 | , "r14" |
michael@0 | 2556 | #endif |
michael@0 | 2557 | #if defined(__SSE2__) |
michael@0 | 2558 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2559 | #endif |
michael@0 | 2560 | ); |
michael@0 | 2561 | } |
michael@0 | 2562 | |
michael@0 | 2563 | void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2564 | const uint8* u_buf, |
michael@0 | 2565 | const uint8* v_buf, |
michael@0 | 2566 | uint8* dst_argb, |
michael@0 | 2567 | int width) { |
michael@0 | 2568 | asm volatile ( |
michael@0 | 2569 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2570 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2571 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2572 | LABELALIGN |
michael@0 | 2573 | "1: \n" |
michael@0 | 2574 | READYUV411 |
michael@0 | 2575 | YUVTORGB |
michael@0 | 2576 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2577 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2578 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2579 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2580 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2581 | "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2582 | "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2583 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2584 | "sub $0x8,%[width] \n" |
michael@0 | 2585 | "jg 1b \n" |
michael@0 | 2586 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2587 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2588 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2589 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2590 | [width]"+rm"(width) // %[width] |
michael@0 | 2591 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2592 | : "memory", "cc" |
michael@0 | 2593 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2594 | , "r14" |
michael@0 | 2595 | #endif |
michael@0 | 2596 | #if defined(__SSE2__) |
michael@0 | 2597 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2598 | #endif |
michael@0 | 2599 | ); |
michael@0 | 2600 | } |
michael@0 | 2601 | |
michael@0 | 2602 | void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2603 | const uint8* uv_buf, |
michael@0 | 2604 | uint8* dst_argb, |
michael@0 | 2605 | int width) { |
michael@0 | 2606 | asm volatile ( |
michael@0 | 2607 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2608 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2609 | LABELALIGN |
michael@0 | 2610 | "1: \n" |
michael@0 | 2611 | READNV12 |
michael@0 | 2612 | YUVTORGB |
michael@0 | 2613 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2614 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2615 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2616 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2617 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2618 | "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2619 | "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2620 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2621 | "sub $0x8,%[width] \n" |
michael@0 | 2622 | "jg 1b \n" |
michael@0 | 2623 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2624 | [uv_buf]"+r"(uv_buf), // %[uv_buf] |
michael@0 | 2625 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2626 | [width]"+rm"(width) // %[width] |
michael@0 | 2627 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2628 | : "memory", "cc" |
michael@0 | 2629 | // Does not use r14. |
michael@0 | 2630 | #if defined(__SSE2__) |
michael@0 | 2631 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2632 | #endif |
michael@0 | 2633 | ); |
michael@0 | 2634 | } |
michael@0 | 2635 | |
michael@0 | 2636 | void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2637 | const uint8* uv_buf, |
michael@0 | 2638 | uint8* dst_argb, |
michael@0 | 2639 | int width) { |
michael@0 | 2640 | asm volatile ( |
michael@0 | 2641 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2642 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2643 | LABELALIGN |
michael@0 | 2644 | "1: \n" |
michael@0 | 2645 | READNV12 |
michael@0 | 2646 | YVUTORGB |
michael@0 | 2647 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 2648 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 2649 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2650 | "punpcklwd %%xmm2,%%xmm0 \n" |
michael@0 | 2651 | "punpckhwd %%xmm2,%%xmm1 \n" |
michael@0 | 2652 | "movdqu %%xmm0," MEMACCESS([dst_argb]) "\n" |
michael@0 | 2653 | "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n" |
michael@0 | 2654 | "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n" |
michael@0 | 2655 | "sub $0x8,%[width] \n" |
michael@0 | 2656 | "jg 1b \n" |
michael@0 | 2657 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2658 | [uv_buf]"+r"(uv_buf), // %[uv_buf] |
michael@0 | 2659 | [dst_argb]"+r"(dst_argb), // %[dst_argb] |
michael@0 | 2660 | [width]"+rm"(width) // %[width] |
michael@0 | 2661 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2662 | : "memory", "cc" |
michael@0 | 2663 | // Does not use r14. |
michael@0 | 2664 | #if defined(__SSE2__) |
michael@0 | 2665 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2666 | #endif |
michael@0 | 2667 | ); |
michael@0 | 2668 | } |
michael@0 | 2669 | |
michael@0 | 2670 | void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, |
michael@0 | 2671 | const uint8* u_buf, |
michael@0 | 2672 | const uint8* v_buf, |
michael@0 | 2673 | uint8* dst_bgra, |
michael@0 | 2674 | int width) { |
michael@0 | 2675 | asm volatile ( |
michael@0 | 2676 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2677 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2678 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2679 | LABELALIGN |
michael@0 | 2680 | "1: \n" |
michael@0 | 2681 | READYUV422 |
michael@0 | 2682 | YUVTORGB |
michael@0 | 2683 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2684 | "punpcklbw %%xmm0,%%xmm1 \n" |
michael@0 | 2685 | "punpcklbw %%xmm2,%%xmm5 \n" |
michael@0 | 2686 | "movdqa %%xmm5,%%xmm0 \n" |
michael@0 | 2687 | "punpcklwd %%xmm1,%%xmm5 \n" |
michael@0 | 2688 | "punpckhwd %%xmm1,%%xmm0 \n" |
michael@0 | 2689 | "movdqa %%xmm5," MEMACCESS([dst_bgra]) "\n" |
michael@0 | 2690 | "movdqa %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" |
michael@0 | 2691 | "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" |
michael@0 | 2692 | "sub $0x8,%[width] \n" |
michael@0 | 2693 | "jg 1b \n" |
michael@0 | 2694 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2695 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2696 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2697 | [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
michael@0 | 2698 | [width]"+rm"(width) // %[width] |
michael@0 | 2699 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2700 | : "memory", "cc" |
michael@0 | 2701 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2702 | , "r14" |
michael@0 | 2703 | #endif |
michael@0 | 2704 | #if defined(__SSE2__) |
michael@0 | 2705 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2706 | #endif |
michael@0 | 2707 | ); |
michael@0 | 2708 | } |
michael@0 | 2709 | |
michael@0 | 2710 | void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, |
michael@0 | 2711 | const uint8* u_buf, |
michael@0 | 2712 | const uint8* v_buf, |
michael@0 | 2713 | uint8* dst_abgr, |
michael@0 | 2714 | int width) { |
michael@0 | 2715 | asm volatile ( |
michael@0 | 2716 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2717 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2718 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2719 | LABELALIGN |
michael@0 | 2720 | "1: \n" |
michael@0 | 2721 | READYUV422 |
michael@0 | 2722 | YUVTORGB |
michael@0 | 2723 | "punpcklbw %%xmm1,%%xmm2 \n" |
michael@0 | 2724 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 2725 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 2726 | "punpcklwd %%xmm0,%%xmm2 \n" |
michael@0 | 2727 | "punpckhwd %%xmm0,%%xmm1 \n" |
michael@0 | 2728 | "movdqa %%xmm2," MEMACCESS([dst_abgr]) "\n" |
michael@0 | 2729 | "movdqa %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" |
michael@0 | 2730 | "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" |
michael@0 | 2731 | "sub $0x8,%[width] \n" |
michael@0 | 2732 | "jg 1b \n" |
michael@0 | 2733 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2734 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2735 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2736 | [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
michael@0 | 2737 | [width]"+rm"(width) // %[width] |
michael@0 | 2738 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2739 | : "memory", "cc" |
michael@0 | 2740 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2741 | , "r14" |
michael@0 | 2742 | #endif |
michael@0 | 2743 | #if defined(__SSE2__) |
michael@0 | 2744 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2745 | #endif |
michael@0 | 2746 | ); |
michael@0 | 2747 | } |
michael@0 | 2748 | |
michael@0 | 2749 | void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, |
michael@0 | 2750 | const uint8* u_buf, |
michael@0 | 2751 | const uint8* v_buf, |
michael@0 | 2752 | uint8* dst_rgba, |
michael@0 | 2753 | int width) { |
michael@0 | 2754 | asm volatile ( |
michael@0 | 2755 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2756 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2757 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2758 | LABELALIGN |
michael@0 | 2759 | "1: \n" |
michael@0 | 2760 | READYUV422 |
michael@0 | 2761 | YUVTORGB |
michael@0 | 2762 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2763 | "punpcklbw %%xmm2,%%xmm1 \n" |
michael@0 | 2764 | "punpcklbw %%xmm0,%%xmm5 \n" |
michael@0 | 2765 | "movdqa %%xmm5,%%xmm0 \n" |
michael@0 | 2766 | "punpcklwd %%xmm1,%%xmm5 \n" |
michael@0 | 2767 | "punpckhwd %%xmm1,%%xmm0 \n" |
michael@0 | 2768 | "movdqa %%xmm5," MEMACCESS([dst_rgba]) "\n" |
michael@0 | 2769 | "movdqa %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" |
michael@0 | 2770 | "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" |
michael@0 | 2771 | "sub $0x8,%[width] \n" |
michael@0 | 2772 | "jg 1b \n" |
michael@0 | 2773 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2774 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2775 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2776 | [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] |
michael@0 | 2777 | [width]"+rm"(width) // %[width] |
michael@0 | 2778 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2779 | : "memory", "cc" |
michael@0 | 2780 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2781 | , "r14" |
michael@0 | 2782 | #endif |
michael@0 | 2783 | #if defined(__SSE2__) |
michael@0 | 2784 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2785 | #endif |
michael@0 | 2786 | ); |
michael@0 | 2787 | } |
michael@0 | 2788 | |
michael@0 | 2789 | void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2790 | const uint8* u_buf, |
michael@0 | 2791 | const uint8* v_buf, |
michael@0 | 2792 | uint8* dst_bgra, |
michael@0 | 2793 | int width) { |
michael@0 | 2794 | asm volatile ( |
michael@0 | 2795 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2796 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2797 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2798 | LABELALIGN |
michael@0 | 2799 | "1: \n" |
michael@0 | 2800 | READYUV422 |
michael@0 | 2801 | YUVTORGB |
michael@0 | 2802 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2803 | "punpcklbw %%xmm0,%%xmm1 \n" |
michael@0 | 2804 | "punpcklbw %%xmm2,%%xmm5 \n" |
michael@0 | 2805 | "movdqa %%xmm5,%%xmm0 \n" |
michael@0 | 2806 | "punpcklwd %%xmm1,%%xmm5 \n" |
michael@0 | 2807 | "punpckhwd %%xmm1,%%xmm0 \n" |
michael@0 | 2808 | "movdqu %%xmm5," MEMACCESS([dst_bgra]) "\n" |
michael@0 | 2809 | "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n" |
michael@0 | 2810 | "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n" |
michael@0 | 2811 | "sub $0x8,%[width] \n" |
michael@0 | 2812 | "jg 1b \n" |
michael@0 | 2813 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2814 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2815 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2816 | [dst_bgra]"+r"(dst_bgra), // %[dst_bgra] |
michael@0 | 2817 | [width]"+rm"(width) // %[width] |
michael@0 | 2818 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2819 | : "memory", "cc" |
michael@0 | 2820 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2821 | , "r14" |
michael@0 | 2822 | #endif |
michael@0 | 2823 | #if defined(__SSE2__) |
michael@0 | 2824 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2825 | #endif |
michael@0 | 2826 | ); |
michael@0 | 2827 | } |
michael@0 | 2828 | |
michael@0 | 2829 | void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2830 | const uint8* u_buf, |
michael@0 | 2831 | const uint8* v_buf, |
michael@0 | 2832 | uint8* dst_abgr, |
michael@0 | 2833 | int width) { |
michael@0 | 2834 | asm volatile ( |
michael@0 | 2835 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2836 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2837 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2838 | LABELALIGN |
michael@0 | 2839 | "1: \n" |
michael@0 | 2840 | READYUV422 |
michael@0 | 2841 | YUVTORGB |
michael@0 | 2842 | "punpcklbw %%xmm1,%%xmm2 \n" |
michael@0 | 2843 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 2844 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 2845 | "punpcklwd %%xmm0,%%xmm2 \n" |
michael@0 | 2846 | "punpckhwd %%xmm0,%%xmm1 \n" |
michael@0 | 2847 | "movdqu %%xmm2," MEMACCESS([dst_abgr]) "\n" |
michael@0 | 2848 | "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n" |
michael@0 | 2849 | "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n" |
michael@0 | 2850 | "sub $0x8,%[width] \n" |
michael@0 | 2851 | "jg 1b \n" |
michael@0 | 2852 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2853 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2854 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2855 | [dst_abgr]"+r"(dst_abgr), // %[dst_abgr] |
michael@0 | 2856 | [width]"+rm"(width) // %[width] |
michael@0 | 2857 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2858 | : "memory", "cc" |
michael@0 | 2859 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2860 | , "r14" |
michael@0 | 2861 | #endif |
michael@0 | 2862 | #if defined(__SSE2__) |
michael@0 | 2863 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2864 | #endif |
michael@0 | 2865 | ); |
michael@0 | 2866 | } |
michael@0 | 2867 | |
michael@0 | 2868 | void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, |
michael@0 | 2869 | const uint8* u_buf, |
michael@0 | 2870 | const uint8* v_buf, |
michael@0 | 2871 | uint8* dst_rgba, |
michael@0 | 2872 | int width) { |
michael@0 | 2873 | asm volatile ( |
michael@0 | 2874 | "sub %[u_buf],%[v_buf] \n" |
michael@0 | 2875 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2876 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 2877 | LABELALIGN |
michael@0 | 2878 | "1: \n" |
michael@0 | 2879 | READYUV422 |
michael@0 | 2880 | YUVTORGB |
michael@0 | 2881 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 2882 | "punpcklbw %%xmm2,%%xmm1 \n" |
michael@0 | 2883 | "punpcklbw %%xmm0,%%xmm5 \n" |
michael@0 | 2884 | "movdqa %%xmm5,%%xmm0 \n" |
michael@0 | 2885 | "punpcklwd %%xmm1,%%xmm5 \n" |
michael@0 | 2886 | "punpckhwd %%xmm1,%%xmm0 \n" |
michael@0 | 2887 | "movdqu %%xmm5," MEMACCESS([dst_rgba]) "\n" |
michael@0 | 2888 | "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n" |
michael@0 | 2889 | "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" |
michael@0 | 2890 | "sub $0x8,%[width] \n" |
michael@0 | 2891 | "jg 1b \n" |
michael@0 | 2892 | : [y_buf]"+r"(y_buf), // %[y_buf] |
michael@0 | 2893 | [u_buf]"+r"(u_buf), // %[u_buf] |
michael@0 | 2894 | [v_buf]"+r"(v_buf), // %[v_buf] |
michael@0 | 2895 | [dst_rgba]"+r"(dst_rgba), // %[dst_rgba] |
michael@0 | 2896 | [width]"+rm"(width) // %[width] |
michael@0 | 2897 | : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] |
michael@0 | 2898 | : "memory", "cc" |
michael@0 | 2899 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2900 | , "r14" |
michael@0 | 2901 | #endif |
michael@0 | 2902 | #if defined(__SSE2__) |
michael@0 | 2903 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2904 | #endif |
michael@0 | 2905 | ); |
michael@0 | 2906 | } |
michael@0 | 2907 | |
michael@0 | 2908 | #endif // HAS_I422TOARGBROW_SSSE3 |
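
// Editor's note: each _Unaligned variant above differs from its aligned twin
// only in using unaligned movdqu stores instead of movdqa. A sketch of the
// dispatch a caller would perform, assuming libyuv's IS_ALIGNED helper:
//
//   void (*row)(const uint8*, const uint8*, const uint8*, uint8*, int) =
//       IS_ALIGNED(dst_argb, 16) ? I422ToARGBRow_SSSE3
//                                : I422ToARGBRow_Unaligned_SSSE3;
//   row(y_buf, u_buf, v_buf, dst_argb, width);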
michael@0 | 2909 | |
michael@0 | 2910 | #ifdef HAS_YTOARGBROW_SSE2 |
michael@0 | 2911 | void YToARGBRow_SSE2(const uint8* y_buf, |
michael@0 | 2912 | uint8* dst_argb, |
michael@0 | 2913 | int width) { |
michael@0 | 2914 | asm volatile ( |
michael@0 | 2915 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 2916 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 2917 | "pslld $0x18,%%xmm4 \n" |
michael@0 | 2918 |     "mov       $0x00100010,%%eax               \n"  // broadcast word 16: the luma offset |
michael@0 | 2919 | "movd %%eax,%%xmm3 \n" |
michael@0 | 2920 | "pshufd $0x0,%%xmm3,%%xmm3 \n" |
michael@0 | 2921 |     "mov       $0x004a004a,%%eax               \n"  // broadcast word 74: 1.164 scaled by 64 |
michael@0 | 2922 | "movd %%eax,%%xmm2 \n" |
michael@0 | 2923 | "pshufd $0x0,%%xmm2,%%xmm2 \n" |
michael@0 | 2924 | LABELALIGN |
michael@0 | 2925 | "1: \n" |
michael@0 | 2926 | // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 |
michael@0 | 2927 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 2928 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 2929 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 2930 |     "psubusw   %%xmm3,%%xmm0                   \n"  // y - 16, saturating at zero |
michael@0 | 2931 |     "pmullw    %%xmm2,%%xmm0                   \n"  // * 74 |
michael@0 | 2932 |     "psrlw     $6, %%xmm0                      \n"  // / 64: net scale 74/64 ~= 1.164 |
michael@0 | 2933 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 2934 | |
michael@0 | 2935 | // Step 2: Weave into ARGB |
michael@0 | 2936 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 2937 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 2938 | "punpcklwd %%xmm0,%%xmm0 \n" |
michael@0 | 2939 | "punpckhwd %%xmm1,%%xmm1 \n" |
michael@0 | 2940 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 2941 | "por %%xmm4,%%xmm1 \n" |
michael@0 | 2942 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 2943 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 2944 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 2945 | |
michael@0 | 2946 | "sub $0x8,%2 \n" |
michael@0 | 2947 | "jg 1b \n" |
michael@0 | 2948 | : "+r"(y_buf), // %0 |
michael@0 | 2949 | "+r"(dst_argb), // %1 |
michael@0 | 2950 | "+rm"(width) // %2 |
michael@0 | 2951 | : |
michael@0 | 2952 | : "memory", "cc", "eax" |
michael@0 | 2953 | #if defined(__SSE2__) |
michael@0 | 2954 |     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 2955 | #endif |
michael@0 | 2956 | ); |
michael@0 | 2957 | } |
michael@0 | 2958 | #endif // HAS_YTOARGBROW_SSE2 |
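
// Editor's note: a scalar sketch of the fixed-point math YToARGBRow_SSE2
// performs per pixel (illustrative only; the saturation matches the psubusw
// and packuswb behavior above):
//
//   int y = y_buf[i] - 16;
//   if (y < 0) y = 0;                  // psubusw saturates at zero
//   int g = (y * 74) >> 6;             // 74/64 = 1.15625, approximating 1.164
//   if (g > 255) g = 255;              // packuswb saturates at 255
//   dst_argb[4 * i + 0] = (uint8)g;    // B
//   dst_argb[4 * i + 1] = (uint8)g;    // G
//   dst_argb[4 * i + 2] = (uint8)g;    // R
//   dst_argb[4 * i + 3] = 255;         // A, from the 0xff000000 mask in xmm4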
michael@0 | 2959 | |
michael@0 | 2960 | #ifdef HAS_MIRRORROW_SSSE3 |
michael@0 | 2961 | // Shuffle table for reversing the bytes. |
michael@0 | 2962 | static uvec8 kShuffleMirror = { |
michael@0 | 2963 | 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u |
michael@0 | 2964 | }; |
michael@0 | 2965 | |
michael@0 | 2966 | void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
michael@0 | 2967 | intptr_t temp_width = (intptr_t)(width); |
michael@0 | 2968 | asm volatile ( |
michael@0 | 2969 | "movdqa %3,%%xmm5 \n" |
michael@0 | 2970 | "lea " MEMLEA(-0x10,0) ",%0 \n" |
michael@0 | 2971 | LABELALIGN |
michael@0 | 2972 | "1: \n" |
michael@0 | 2973 | MEMOPREG(movdqa,0x00,0,2,1,xmm0) // movdqa (%0,%2),%%xmm0 |
michael@0 | 2974 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 2975 | "sub $0x10,%2 \n" |
michael@0 | 2976 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 2977 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 2978 | "jg 1b \n" |
michael@0 | 2979 | : "+r"(src), // %0 |
michael@0 | 2980 | "+r"(dst), // %1 |
michael@0 | 2981 | "+r"(temp_width) // %2 |
michael@0 | 2982 | : "m"(kShuffleMirror) // %3 |
michael@0 | 2983 | : "memory", "cc" |
michael@0 | 2984 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 2985 | , "r14" |
michael@0 | 2986 | #endif |
michael@0 | 2987 | #if defined(__SSE2__) |
michael@0 | 2988 | , "xmm0", "xmm5" |
michael@0 | 2989 | #endif |
michael@0 | 2990 | ); |
michael@0 | 2991 | } |
michael@0 | 2992 | #endif // HAS_MIRRORROW_SSSE3 |
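
// Editor's note: scalar equivalent of MirrorRow (a sketch):
//
//   for (int i = 0; i < width; ++i) {
//     dst[i] = src[width - 1 - i];
//   }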
michael@0 | 2993 | |
michael@0 | 2994 | #ifdef HAS_MIRRORROW_SSE2 |
michael@0 | 2995 | void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { |
michael@0 | 2996 | intptr_t temp_width = (intptr_t)(width); |
michael@0 | 2997 | asm volatile ( |
michael@0 | 2998 | "lea " MEMLEA(-0x10,0) ",%0 \n" |
michael@0 | 2999 | LABELALIGN |
michael@0 | 3000 | "1: \n" |
michael@0 | 3001 | MEMOPREG(movdqu,0x00,0,2,1,xmm0) // movdqu (%0,%2),%%xmm0 |
michael@0 | 3002 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3003 | "psllw $0x8,%%xmm0 \n" |
michael@0 | 3004 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3005 | "por %%xmm1,%%xmm0 \n" |
michael@0 | 3006 | "pshuflw $0x1b,%%xmm0,%%xmm0 \n" |
michael@0 | 3007 | "pshufhw $0x1b,%%xmm0,%%xmm0 \n" |
michael@0 | 3008 | "pshufd $0x4e,%%xmm0,%%xmm0 \n" |
michael@0 | 3009 | "sub $0x10,%2 \n" |
michael@0 | 3010 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3011 | "lea " MEMLEA(0x10,1)",%1 \n" |
michael@0 | 3012 | "jg 1b \n" |
michael@0 | 3013 | : "+r"(src), // %0 |
michael@0 | 3014 | "+r"(dst), // %1 |
michael@0 | 3015 | "+r"(temp_width) // %2 |
michael@0 | 3016 | : |
michael@0 | 3017 | : "memory", "cc" |
michael@0 | 3018 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3019 | , "r14" |
michael@0 | 3020 | #endif |
michael@0 | 3021 | #if defined(__SSE2__) |
michael@0 | 3022 | , "xmm0", "xmm1" |
michael@0 | 3023 | #endif |
michael@0 | 3024 | ); |
michael@0 | 3025 | } |
michael@0 | 3026 | #endif // HAS_MIRRORROW_SSE2 |
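
// Editor's note: without pshufb, MirrorRow_SSE2 reverses 16 bytes in three
// steps, sketched here on byte indices 0..15:
//
//   psllw/psrlw/por       : swap bytes within each 16-bit word -> 1 0 3 2 ... 15 14
//   pshuflw/pshufhw $0x1b : reverse the 4 words in each half   -> 7 6 .. 0 | 15 14 .. 8
//   pshufd $0x4e          : swap the two 64-bit halves         -> 15 14 .. 1 0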
michael@0 | 3027 | |
michael@0 | 3028 | #ifdef HAS_MIRRORROW_UV_SSSE3 |
michael@0 | 3029 | // Shuffle table for reversing UV pairs and splitting the channels: U bytes to the low half, V bytes to the high half, both reversed. |
michael@0 | 3030 | static uvec8 kShuffleMirrorUV = { |
michael@0 | 3031 | 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
michael@0 | 3032 | }; |
michael@0 | 3033 | void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
michael@0 | 3034 | int width) { |
michael@0 | 3035 | intptr_t temp_width = (intptr_t)(width); |
michael@0 | 3036 | asm volatile ( |
michael@0 | 3037 | "movdqa %4,%%xmm1 \n" |
michael@0 | 3038 | "lea " MEMLEA4(-0x10,0,3,2) ",%0 \n" |
michael@0 | 3039 | "sub %1,%2 \n" |
michael@0 | 3040 | LABELALIGN |
michael@0 | 3041 | "1: \n" |
michael@0 | 3042 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3043 | "lea " MEMLEA(-0x10,0) ",%0 \n" |
michael@0 | 3044 | "pshufb %%xmm1,%%xmm0 \n" |
michael@0 | 3045 | "sub $8,%3 \n" |
michael@0 | 3046 | "movlpd %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3047 | BUNDLEALIGN |
michael@0 | 3048 | MEMOPMEM(movhpd,xmm0,0x00,1,2,1) // movhpd %%xmm0,(%1,%2) |
michael@0 | 3049 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3050 | "jg 1b \n" |
michael@0 | 3051 | : "+r"(src), // %0 |
michael@0 | 3052 | "+r"(dst_u), // %1 |
michael@0 | 3053 | "+r"(dst_v), // %2 |
michael@0 | 3054 | "+r"(temp_width) // %3 |
michael@0 | 3055 | : "m"(kShuffleMirrorUV) // %4 |
michael@0 | 3056 | : "memory", "cc" |
michael@0 | 3057 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3058 | , "r14" |
michael@0 | 3059 | #endif |
michael@0 | 3060 | #if defined(__SSE2__) |
michael@0 | 3061 | , "xmm0", "xmm1" |
michael@0 | 3062 | #endif |
michael@0 | 3063 | ); |
michael@0 | 3064 | } |
michael@0 | 3065 | #endif // HAS_MIRRORROW_UV_SSSE3 |
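
// Editor's note: scalar equivalent of MirrorUVRow (a sketch; width counts UV
// pairs):
//
//   for (int i = 0; i < width; ++i) {
//     dst_u[i] = src[(width - 1 - i) * 2 + 0];
//     dst_v[i] = src[(width - 1 - i) * 2 + 1];
//   }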
michael@0 | 3066 | |
michael@0 | 3067 | #ifdef HAS_ARGBMIRRORROW_SSSE3 |
michael@0 | 3068 | // Shuffle table for reversing ARGB pixels (4 bytes each); byte order within each pixel is preserved. |
michael@0 | 3069 | static uvec8 kARGBShuffleMirror = { |
michael@0 | 3070 | 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u |
michael@0 | 3071 | }; |
michael@0 | 3072 | |
michael@0 | 3073 | void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { |
michael@0 | 3074 | intptr_t temp_width = (intptr_t)(width); |
michael@0 | 3075 | asm volatile ( |
michael@0 | 3076 | "lea " MEMLEA4(-0x10,0,2,4) ",%0 \n" |
michael@0 | 3077 | "movdqa %3,%%xmm5 \n" |
michael@0 | 3078 | LABELALIGN |
michael@0 | 3079 | "1: \n" |
michael@0 | 3080 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3081 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 3082 | "lea " MEMLEA(-0x10,0) ",%0 \n" |
michael@0 | 3083 | "sub $0x4,%2 \n" |
michael@0 | 3084 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3085 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3086 | "jg 1b \n" |
michael@0 | 3087 | : "+r"(src), // %0 |
michael@0 | 3088 | "+r"(dst), // %1 |
michael@0 | 3089 | "+r"(temp_width) // %2 |
michael@0 | 3090 | : "m"(kARGBShuffleMirror) // %3 |
michael@0 | 3091 | : "memory", "cc" |
michael@0 | 3092 | #if defined(__SSE2__) |
michael@0 | 3093 | , "xmm0", "xmm5" |
michael@0 | 3094 | #endif |
michael@0 | 3095 | ); |
michael@0 | 3096 | } |
michael@0 | 3097 | #endif // HAS_ARGBMIRRORROW_SSSE3 |
michael@0 | 3098 | |
michael@0 | 3099 | #ifdef HAS_SPLITUVROW_SSE2 |
michael@0 | 3100 | void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3101 | asm volatile ( |
michael@0 | 3102 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3103 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3104 | "sub %1,%2 \n" |
michael@0 | 3105 | LABELALIGN |
michael@0 | 3106 | "1: \n" |
michael@0 | 3107 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3108 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3109 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3110 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 3111 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 3112 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3113 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3114 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3115 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 3116 | "psrlw $0x8,%%xmm3 \n" |
michael@0 | 3117 | "packuswb %%xmm3,%%xmm2 \n" |
michael@0 | 3118 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3119 | MEMOPMEM(movdqa,xmm2,0x00,1,2,1) // movdqa %%xmm2,(%1,%2) |
michael@0 | 3120 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3121 | "sub $0x10,%3 \n" |
michael@0 | 3122 | "jg 1b \n" |
michael@0 | 3123 | : "+r"(src_uv), // %0 |
michael@0 | 3124 | "+r"(dst_u), // %1 |
michael@0 | 3125 | "+r"(dst_v), // %2 |
michael@0 | 3126 | "+r"(pix) // %3 |
michael@0 | 3127 | : |
michael@0 | 3128 | : "memory", "cc" |
michael@0 | 3129 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3130 | , "r14" |
michael@0 | 3131 | #endif |
michael@0 | 3132 | #if defined(__SSE2__) |
michael@0 | 3133 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 3134 | #endif |
michael@0 | 3135 | ); |
michael@0 | 3136 | } |
michael@0 | 3137 | |
michael@0 | 3138 | void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, |
michael@0 | 3139 | int pix) { |
michael@0 | 3140 | asm volatile ( |
michael@0 | 3141 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3142 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3143 | "sub %1,%2 \n" |
michael@0 | 3144 | LABELALIGN |
michael@0 | 3145 | "1: \n" |
michael@0 | 3146 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3147 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3148 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3149 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 3150 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 3151 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3152 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3153 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3154 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 3155 | "psrlw $0x8,%%xmm3 \n" |
michael@0 | 3156 | "packuswb %%xmm3,%%xmm2 \n" |
michael@0 | 3157 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3158 | MEMOPMEM(movdqu,xmm2,0x00,1,2,1) // movdqu %%xmm2,(%1,%2) |
michael@0 | 3159 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3160 | "sub $0x10,%3 \n" |
michael@0 | 3161 | "jg 1b \n" |
michael@0 | 3162 | : "+r"(src_uv), // %0 |
michael@0 | 3163 | "+r"(dst_u), // %1 |
michael@0 | 3164 | "+r"(dst_v), // %2 |
michael@0 | 3165 | "+r"(pix) // %3 |
michael@0 | 3166 | : |
michael@0 | 3167 | : "memory", "cc" |
michael@0 | 3168 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3169 | , "r14" |
michael@0 | 3170 | #endif |
michael@0 | 3171 | #if defined(__SSE2__) |
michael@0 | 3172 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 3173 | #endif |
michael@0 | 3174 | ); |
michael@0 | 3175 | } |
michael@0 | 3176 | #endif // HAS_SPLITUVROW_SSE2 |
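
// Editor's note: scalar equivalent of SplitUVRow (a sketch):
//
//   for (int i = 0; i < pix; ++i) {
//     dst_u[i] = src_uv[2 * i + 0];  // even bytes: U (kept by the 0x00ff mask)
//     dst_v[i] = src_uv[2 * i + 1];  // odd bytes: V (isolated by psrlw $8)
//   }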
michael@0 | 3177 | |
michael@0 | 3178 | #ifdef HAS_MERGEUVROW_SSE2 |
michael@0 | 3179 | void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
michael@0 | 3180 | int width) { |
michael@0 | 3181 | asm volatile ( |
michael@0 | 3182 | "sub %0,%1 \n" |
michael@0 | 3183 | LABELALIGN |
michael@0 | 3184 | "1: \n" |
michael@0 | 3185 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3186 | MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 |
michael@0 | 3187 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 3188 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 3189 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 3190 | "punpckhbw %%xmm1,%%xmm2 \n" |
michael@0 | 3191 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 3192 | "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" |
michael@0 | 3193 | "lea " MEMLEA(0x20,2) ",%2 \n" |
michael@0 | 3194 | "sub $0x10,%3 \n" |
michael@0 | 3195 | "jg 1b \n" |
michael@0 | 3196 | : "+r"(src_u), // %0 |
michael@0 | 3197 | "+r"(src_v), // %1 |
michael@0 | 3198 | "+r"(dst_uv), // %2 |
michael@0 | 3199 | "+r"(width) // %3 |
michael@0 | 3200 | : |
michael@0 | 3201 | : "memory", "cc" |
michael@0 | 3202 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3203 | , "r14" |
michael@0 | 3204 | #endif |
michael@0 | 3205 | #if defined(__SSE2__) |
michael@0 | 3206 | , "xmm0", "xmm1", "xmm2" |
michael@0 | 3207 | #endif |
michael@0 | 3208 | ); |
michael@0 | 3209 | } |
michael@0 | 3210 | |
michael@0 | 3211 | void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v, |
michael@0 | 3212 | uint8* dst_uv, int width) { |
michael@0 | 3213 | asm volatile ( |
michael@0 | 3214 | "sub %0,%1 \n" |
michael@0 | 3215 | LABELALIGN |
michael@0 | 3216 | "1: \n" |
michael@0 | 3217 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3218 | MEMOPREG(movdqu,0x00,0,1,1,xmm1) // movdqu (%0,%1,1),%%xmm1 |
michael@0 | 3219 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 3220 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 3221 | "punpcklbw %%xmm1,%%xmm0 \n" |
michael@0 | 3222 | "punpckhbw %%xmm1,%%xmm2 \n" |
michael@0 | 3223 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 3224 | "movdqu %%xmm2," MEMACCESS2(0x10,2) " \n" |
michael@0 | 3225 | "lea " MEMLEA(0x20,2) ",%2 \n" |
michael@0 | 3226 | "sub $0x10,%3 \n" |
michael@0 | 3227 | "jg 1b \n" |
michael@0 | 3228 | : "+r"(src_u), // %0 |
michael@0 | 3229 | "+r"(src_v), // %1 |
michael@0 | 3230 | "+r"(dst_uv), // %2 |
michael@0 | 3231 | "+r"(width) // %3 |
michael@0 | 3232 | : |
michael@0 | 3233 | : "memory", "cc" |
michael@0 | 3234 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3235 | , "r14" |
michael@0 | 3236 | #endif |
michael@0 | 3237 | #if defined(__SSE2__) |
michael@0 | 3238 | , "xmm0", "xmm1", "xmm2" |
michael@0 | 3239 | #endif |
michael@0 | 3240 | ); |
michael@0 | 3241 | } |
michael@0 | 3242 | #endif // HAS_MERGEUVROW_SSE2 |
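
// Editor's note: scalar equivalent of MergeUVRow (a sketch):
//
//   for (int i = 0; i < width; ++i) {
//     dst_uv[2 * i + 0] = src_u[i];  // punpcklbw/punpckhbw interleave U,V
//     dst_uv[2 * i + 1] = src_v[i];
//   }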
michael@0 | 3243 | |
michael@0 | 3244 | #ifdef HAS_COPYROW_SSE2 |
michael@0 | 3245 | void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { |
michael@0 | 3246 | asm volatile ( |
michael@0 | 3247 | LABELALIGN |
michael@0 | 3248 | "1: \n" |
michael@0 | 3249 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3250 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3251 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3252 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3253 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 3254 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 3255 | "sub $0x20,%2 \n" |
michael@0 | 3256 | "jg 1b \n" |
michael@0 | 3257 | : "+r"(src), // %0 |
michael@0 | 3258 | "+r"(dst), // %1 |
michael@0 | 3259 | "+r"(count) // %2 |
michael@0 | 3260 | : |
michael@0 | 3261 | : "memory", "cc" |
michael@0 | 3262 | #if defined(__SSE2__) |
michael@0 | 3263 | , "xmm0", "xmm1" |
michael@0 | 3264 | #endif |
michael@0 | 3265 | ); |
michael@0 | 3266 | } |
michael@0 | 3267 | #endif // HAS_COPYROW_SSE2 |
michael@0 | 3268 | |
michael@0 | 3269 | #ifdef HAS_COPYROW_X86 |
michael@0 | 3270 | void CopyRow_X86(const uint8* src, uint8* dst, int width) { |
michael@0 | 3271 | size_t width_tmp = (size_t)(width); |
michael@0 | 3272 | asm volatile ( |
michael@0 | 3273 | "shr $0x2,%2 \n" |
michael@0 | 3274 | "rep movsl " MEMMOVESTRING(0,1) " \n" |
michael@0 | 3275 | : "+S"(src), // %0 |
michael@0 | 3276 | "+D"(dst), // %1 |
michael@0 | 3277 | "+c"(width_tmp) // %2 |
michael@0 | 3278 | : |
michael@0 | 3279 | : "memory", "cc" |
michael@0 | 3280 | ); |
michael@0 | 3281 | } |
michael@0 | 3282 | #endif // HAS_COPYROW_X86 |
michael@0 | 3283 | |
michael@0 | 3284 | #ifdef HAS_COPYROW_ERMS |
michael@0 | 3285 | // Handles unaligned pointers and any byte count (multiple of 1). |
michael@0 | 3286 | void CopyRow_ERMS(const uint8* src, uint8* dst, int width) { |
michael@0 | 3287 | size_t width_tmp = (size_t)(width); |
michael@0 | 3288 | asm volatile ( |
michael@0 | 3289 | "rep movsb " MEMMOVESTRING(0,1) " \n" |
michael@0 | 3290 | : "+S"(src), // %0 |
michael@0 | 3291 | "+D"(dst), // %1 |
michael@0 | 3292 | "+c"(width_tmp) // %2 |
michael@0 | 3293 | : |
michael@0 | 3294 | : "memory", "cc" |
michael@0 | 3295 | ); |
michael@0 | 3296 | } |
michael@0 | 3297 | #endif // HAS_COPYROW_ERMS |
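
// Editor's note: both string-instruction copies amount to
// memcpy(dst, src, width). CopyRow_X86 shifts the count right by two and uses
// rep movsl, so it assumes width is a multiple of 4; CopyRow_ERMS uses
// rep movsb, which has no alignment or multiple requirement and is fast on
// CPUs with the Enhanced REP MOVSB (ERMS) feature.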
michael@0 | 3298 | |
michael@0 | 3299 | #ifdef HAS_ARGBCOPYALPHAROW_SSE2 |
michael@0 | 3300 | // width in pixels |
michael@0 | 3301 | void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3302 | asm volatile ( |
michael@0 | 3303 | "pcmpeqb %%xmm0,%%xmm0 \n" |
michael@0 | 3304 | "pslld $0x18,%%xmm0 \n" |
michael@0 | 3305 | "pcmpeqb %%xmm1,%%xmm1 \n" |
michael@0 | 3306 | "psrld $0x8,%%xmm1 \n" |
michael@0 | 3307 | LABELALIGN |
michael@0 | 3308 | "1: \n" |
michael@0 | 3309 | "movdqa " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 3310 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" |
michael@0 | 3311 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3312 | "movdqa " MEMACCESS(1) ",%%xmm4 \n" |
michael@0 | 3313 | "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" |
michael@0 | 3314 | "pand %%xmm0,%%xmm2 \n" |
michael@0 | 3315 | "pand %%xmm0,%%xmm3 \n" |
michael@0 | 3316 | "pand %%xmm1,%%xmm4 \n" |
michael@0 | 3317 | "pand %%xmm1,%%xmm5 \n" |
michael@0 | 3318 | "por %%xmm4,%%xmm2 \n" |
michael@0 | 3319 | "por %%xmm5,%%xmm3 \n" |
michael@0 | 3320 | "movdqa %%xmm2," MEMACCESS(1) " \n" |
michael@0 | 3321 | "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" |
michael@0 | 3322 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 3323 | "sub $0x8,%2 \n" |
michael@0 | 3324 | "jg 1b \n" |
michael@0 | 3325 | : "+r"(src), // %0 |
michael@0 | 3326 | "+r"(dst), // %1 |
michael@0 | 3327 | "+r"(width) // %2 |
michael@0 | 3328 | : |
michael@0 | 3329 | : "memory", "cc" |
michael@0 | 3330 | #if defined(__SSE2__) |
michael@0 | 3331 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 3332 | #endif |
michael@0 | 3333 | ); |
michael@0 | 3334 | } |
michael@0 | 3335 | #endif // HAS_ARGBCOPYALPHAROW_SSE2 |
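
// Editor's note: scalar equivalent of ARGBCopyAlphaRow (a sketch). The
// 0xff000000 / 0x00ffffff masks above merge the source alpha with the dest
// RGB:
//
//   for (int i = 0; i < width; ++i) {
//     dst[4 * i + 3] = src[4 * i + 3];  // copy alpha; RGB bytes unchanged
//   }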
michael@0 | 3336 | |
michael@0 | 3337 | #ifdef HAS_ARGBCOPYALPHAROW_AVX2 |
michael@0 | 3338 | // width in pixels |
michael@0 | 3339 | void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3340 | asm volatile ( |
michael@0 | 3341 | "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" |
michael@0 | 3342 | "vpsrld $0x8,%%ymm0,%%ymm0 \n" |
michael@0 | 3343 | LABELALIGN |
michael@0 | 3344 | "1: \n" |
michael@0 | 3345 | "vmovdqu " MEMACCESS(0) ",%%ymm1 \n" |
michael@0 | 3346 | "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm2 \n" |
michael@0 | 3347 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 3348 | "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" |
michael@0 | 3349 | "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" |
michael@0 | 3350 | "vmovdqu %%ymm1," MEMACCESS(1) " \n" |
michael@0 | 3351 | "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" |
michael@0 | 3352 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 3353 | "sub $0x10,%2 \n" |
michael@0 | 3354 | "jg 1b \n" |
michael@0 | 3355 | "vzeroupper \n" |
michael@0 | 3356 | : "+r"(src), // %0 |
michael@0 | 3357 | "+r"(dst), // %1 |
michael@0 | 3358 | "+r"(width) // %2 |
michael@0 | 3359 | : |
michael@0 | 3360 | : "memory", "cc" |
michael@0 | 3361 | #if defined(__SSE2__) |
michael@0 | 3362 | , "xmm0", "xmm1", "xmm2" |
michael@0 | 3363 | #endif |
michael@0 | 3364 | ); |
michael@0 | 3365 | } |
michael@0 | 3366 | #endif // HAS_ARGBCOPYALPHAROW_AVX2 |
michael@0 | 3367 | |
michael@0 | 3368 | #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 |
michael@0 | 3369 | // width in pixels |
michael@0 | 3370 | void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3371 | asm volatile ( |
michael@0 | 3372 | "pcmpeqb %%xmm0,%%xmm0 \n" |
michael@0 | 3373 | "pslld $0x18,%%xmm0 \n" |
michael@0 | 3374 | "pcmpeqb %%xmm1,%%xmm1 \n" |
michael@0 | 3375 | "psrld $0x8,%%xmm1 \n" |
michael@0 | 3376 | LABELALIGN |
michael@0 | 3377 | "1: \n" |
michael@0 | 3378 | "movq " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 3379 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 3380 |     "punpcklbw %%xmm2,%%xmm2                   \n"  // duplicate each Y byte into a word |
michael@0 | 3381 |     "punpckhwd %%xmm2,%%xmm3                   \n"  // xmm3's stale bytes are masked off by the pand below |
michael@0 | 3382 |     "punpcklwd %%xmm2,%%xmm2                   \n"  // spread Y words into dwords; Y lands in the top byte |
michael@0 | 3383 | "movdqa " MEMACCESS(1) ",%%xmm4 \n" |
michael@0 | 3384 | "movdqa " MEMACCESS2(0x10,1) ",%%xmm5 \n" |
michael@0 | 3385 | "pand %%xmm0,%%xmm2 \n" |
michael@0 | 3386 | "pand %%xmm0,%%xmm3 \n" |
michael@0 | 3387 | "pand %%xmm1,%%xmm4 \n" |
michael@0 | 3388 | "pand %%xmm1,%%xmm5 \n" |
michael@0 | 3389 | "por %%xmm4,%%xmm2 \n" |
michael@0 | 3390 | "por %%xmm5,%%xmm3 \n" |
michael@0 | 3391 | "movdqa %%xmm2," MEMACCESS(1) " \n" |
michael@0 | 3392 | "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" |
michael@0 | 3393 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 3394 | "sub $0x8,%2 \n" |
michael@0 | 3395 | "jg 1b \n" |
michael@0 | 3396 | : "+r"(src), // %0 |
michael@0 | 3397 | "+r"(dst), // %1 |
michael@0 | 3398 | "+r"(width) // %2 |
michael@0 | 3399 | : |
michael@0 | 3400 | : "memory", "cc" |
michael@0 | 3401 | #if defined(__SSE2__) |
michael@0 | 3402 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 3403 | #endif |
michael@0 | 3404 | ); |
michael@0 | 3405 | } |
michael@0 | 3406 | #endif // HAS_ARGBCOPYYTOALPHAROW_SSE2 |
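
// Editor's note: scalar equivalent of ARGBCopyYToAlphaRow (a sketch):
//
//   for (int i = 0; i < width; ++i) {
//     dst[4 * i + 3] = src[i];  // Y plane byte becomes the alpha channel
//   }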
michael@0 | 3407 | |
michael@0 | 3408 | #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 |
michael@0 | 3409 | // width in pixels |
michael@0 | 3410 | void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { |
michael@0 | 3411 | asm volatile ( |
michael@0 | 3412 | "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" |
michael@0 | 3413 | "vpsrld $0x8,%%ymm0,%%ymm0 \n" |
michael@0 | 3414 | LABELALIGN |
michael@0 | 3415 | "1: \n" |
michael@0 | 3416 | "vpmovzxbd " MEMACCESS(0) ",%%ymm1 \n" |
michael@0 | 3417 | "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2 \n" |
michael@0 | 3418 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 3419 | "vpslld $0x18,%%ymm1,%%ymm1 \n" |
michael@0 | 3420 | "vpslld $0x18,%%ymm2,%%ymm2 \n" |
michael@0 | 3421 | "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1 \n" |
michael@0 | 3422 | "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2 \n" |
michael@0 | 3423 | "vmovdqu %%ymm1," MEMACCESS(1) " \n" |
michael@0 | 3424 | "vmovdqu %%ymm2," MEMACCESS2(0x20,1) " \n" |
michael@0 | 3425 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 3426 | "sub $0x10,%2 \n" |
michael@0 | 3427 | "jg 1b \n" |
michael@0 | 3428 | "vzeroupper \n" |
michael@0 | 3429 | : "+r"(src), // %0 |
michael@0 | 3430 | "+r"(dst), // %1 |
michael@0 | 3431 | "+r"(width) // %2 |
michael@0 | 3432 | : |
michael@0 | 3433 | : "memory", "cc" |
michael@0 | 3434 | #if defined(__SSE2__) |
michael@0 | 3435 | , "xmm0", "xmm1", "xmm2" |
michael@0 | 3436 | #endif |
michael@0 | 3437 | ); |
michael@0 | 3438 | } |
michael@0 | 3439 | #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 |
michael@0 | 3440 | |
michael@0 | 3441 | #ifdef HAS_SETROW_X86 |
michael@0 | 3442 | void SetRow_X86(uint8* dst, uint32 v32, int width) { |
michael@0 | 3443 | size_t width_tmp = (size_t)(width); |
michael@0 | 3444 | asm volatile ( |
michael@0 | 3445 |     "shr       $0x2,%1                         \n"  // width is in bytes; stores width/4 dwords, so width should be a multiple of 4 |
michael@0 | 3446 | "rep stosl " MEMSTORESTRING(eax,0) " \n" |
michael@0 | 3447 | : "+D"(dst), // %0 |
michael@0 | 3448 | "+c"(width_tmp) // %1 |
michael@0 | 3449 | : "a"(v32) // %2 |
michael@0 | 3450 | : "memory", "cc"); |
michael@0 | 3451 | } |
michael@0 | 3452 | |
michael@0 | 3453 | void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, |
michael@0 | 3454 | int dst_stride, int height) { |
michael@0 | 3455 | for (int y = 0; y < height; ++y) { |
michael@0 | 3456 | size_t width_tmp = (size_t)(width); |
michael@0 | 3457 | uint32* d = (uint32*)(dst); |
michael@0 | 3458 | asm volatile ( |
michael@0 | 3459 | "rep stosl " MEMSTORESTRING(eax,0) " \n" |
michael@0 | 3460 | : "+D"(d), // %0 |
michael@0 | 3461 | "+c"(width_tmp) // %1 |
michael@0 | 3462 | : "a"(v32) // %2 |
michael@0 | 3463 | : "memory", "cc"); |
michael@0 | 3464 | dst += dst_stride; |
michael@0 | 3465 | } |
michael@0 | 3466 | } |
michael@0 | 3467 | #endif // HAS_SETROW_X86 |
michael@0 | 3468 | |
michael@0 | 3469 | #ifdef HAS_YUY2TOYROW_SSE2 |
michael@0 | 3470 | void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { |
michael@0 | 3471 | asm volatile ( |
michael@0 | 3472 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3473 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3474 | LABELALIGN |
michael@0 | 3475 | "1: \n" |
michael@0 | 3476 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3477 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3478 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3479 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3480 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3481 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3482 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3483 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3484 | "sub $0x10,%2 \n" |
michael@0 | 3485 | "jg 1b \n" |
michael@0 | 3486 | : "+r"(src_yuy2), // %0 |
michael@0 | 3487 | "+r"(dst_y), // %1 |
michael@0 | 3488 | "+r"(pix) // %2 |
michael@0 | 3489 | : |
michael@0 | 3490 | : "memory", "cc" |
michael@0 | 3491 | #if defined(__SSE2__) |
michael@0 | 3492 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 3493 | #endif |
michael@0 | 3494 | ); |
michael@0 | 3495 | } |
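
// Editor's note: YUY2 is packed as Y0 U0 Y1 V0, so luma lives in the even
// bytes (kept above by the 0x00ff mask in xmm5). Scalar sketch:
//
//   for (int i = 0; i < pix; ++i) {
//     dst_y[i] = src_yuy2[2 * i];
//   }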
michael@0 | 3496 | |
michael@0 | 3497 | void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, |
michael@0 | 3498 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3499 | asm volatile ( |
michael@0 | 3500 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3501 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3502 | "sub %1,%2 \n" |
michael@0 | 3503 | LABELALIGN |
michael@0 | 3504 | "1: \n" |
michael@0 | 3505 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3506 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3507 | BUNDLEALIGN |
michael@0 | 3508 | MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 |
michael@0 | 3509 | MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 |
michael@0 | 3510 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3511 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 3512 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 3513 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 3514 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3515 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3516 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3517 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3518 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3519 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3520 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3521 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3522 | BUNDLEALIGN |
michael@0 | 3523 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3524 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3525 | "sub $0x10,%3 \n" |
michael@0 | 3526 | "jg 1b \n" |
michael@0 | 3527 | : "+r"(src_yuy2), // %0 |
michael@0 | 3528 | "+r"(dst_u), // %1 |
michael@0 | 3529 | "+r"(dst_v), // %2 |
michael@0 | 3530 | "+r"(pix) // %3 |
michael@0 | 3531 | : "r"((intptr_t)(stride_yuy2)) // %4 |
michael@0 | 3532 | : "memory", "cc" |
michael@0 | 3533 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3534 | , "r14" |
michael@0 | 3535 | #endif |
michael@0 | 3536 | #if defined(__SSE2__) |
michael@0 | 3537 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 3538 | #endif |
michael@0 | 3539 | ); |
michael@0 | 3540 | } |
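
// Editor's note: scalar sketch of YUY2ToUVRow. Chroma is shared by two pixels
// horizontally, and two rows are averaged vertically; pavgb rounds, i.e.
// (a + b + 1) >> 1:
//
//   const uint8* next = src_yuy2 + stride_yuy2;
//   for (int i = 0; i < pix; i += 2) {
//     dst_u[i / 2] = (src_yuy2[2 * i + 1] + next[2 * i + 1] + 1) >> 1;
//     dst_v[i / 2] = (src_yuy2[2 * i + 3] + next[2 * i + 3] + 1) >> 1;
//   }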
michael@0 | 3541 | |
michael@0 | 3542 | void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, |
michael@0 | 3543 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3544 | asm volatile ( |
michael@0 | 3545 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3546 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3547 | "sub %1,%2 \n" |
michael@0 | 3548 | LABELALIGN |
michael@0 | 3549 | "1: \n" |
michael@0 | 3550 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3551 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3552 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3553 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 3554 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3555 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3556 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3557 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3558 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3559 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3560 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3561 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3562 | BUNDLEALIGN |
michael@0 | 3563 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3564 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3565 | "sub $0x10,%3 \n" |
michael@0 | 3566 | "jg 1b \n" |
michael@0 | 3567 | : "+r"(src_yuy2), // %0 |
michael@0 | 3568 | "+r"(dst_u), // %1 |
michael@0 | 3569 | "+r"(dst_v), // %2 |
michael@0 | 3570 | "+r"(pix) // %3 |
michael@0 | 3571 | : |
michael@0 | 3572 | : "memory", "cc" |
michael@0 | 3573 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3574 | , "r14" |
michael@0 | 3575 | #endif |
michael@0 | 3576 | #if defined(__SSE2__) |
michael@0 | 3577 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 3578 | #endif |
michael@0 | 3579 | ); |
michael@0 | 3580 | } |
michael@0 | 3581 | |
michael@0 | 3582 | void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, |
michael@0 | 3583 | uint8* dst_y, int pix) { |
michael@0 | 3584 | asm volatile ( |
michael@0 | 3585 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3586 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3587 | LABELALIGN |
michael@0 | 3588 | "1: \n" |
michael@0 | 3589 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3590 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3591 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3592 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3593 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3594 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3595 | "sub $0x10,%2 \n" |
michael@0 | 3596 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3597 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3598 | "jg 1b \n" |
michael@0 | 3599 | : "+r"(src_yuy2), // %0 |
michael@0 | 3600 | "+r"(dst_y), // %1 |
michael@0 | 3601 | "+r"(pix) // %2 |
michael@0 | 3602 | : |
michael@0 | 3603 | : "memory", "cc" |
michael@0 | 3604 | #if defined(__SSE2__) |
michael@0 | 3605 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 3606 | #endif |
michael@0 | 3607 | ); |
michael@0 | 3608 | } |
michael@0 | 3609 | |
michael@0 | 3610 | void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, |
michael@0 | 3611 | int stride_yuy2, |
michael@0 | 3612 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3613 | asm volatile ( |
michael@0 | 3614 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3615 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3616 | "sub %1,%2 \n" |
michael@0 | 3617 | LABELALIGN |
michael@0 | 3618 | "1: \n" |
michael@0 | 3619 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3620 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3621 | BUNDLEALIGN |
michael@0 | 3622 | MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
michael@0 | 3623 | MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
michael@0 | 3624 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3625 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 3626 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 3627 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 3628 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3629 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3630 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3631 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3632 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3633 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3634 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3635 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3636 | BUNDLEALIGN |
michael@0 | 3637 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3638 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3639 | "sub $0x10,%3 \n" |
michael@0 | 3640 | "jg 1b \n" |
michael@0 | 3641 | : "+r"(src_yuy2), // %0 |
michael@0 | 3642 | "+r"(dst_u), // %1 |
michael@0 | 3643 | "+r"(dst_v), // %2 |
michael@0 | 3644 | "+r"(pix) // %3 |
michael@0 | 3645 | : "r"((intptr_t)(stride_yuy2)) // %4 |
michael@0 | 3646 | : "memory", "cc" |
michael@0 | 3647 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3648 | , "r14" |
michael@0 | 3649 | #endif |
michael@0 | 3650 | #if defined(__SSE2__) |
michael@0 | 3651 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 3652 | #endif |
michael@0 | 3653 | ); |
michael@0 | 3654 | } |
michael@0 | 3655 | |
michael@0 | 3656 | void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, |
michael@0 | 3657 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3658 | asm volatile ( |
michael@0 | 3659 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3660 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3661 | "sub %1,%2 \n" |
michael@0 | 3662 | LABELALIGN |
michael@0 | 3663 | "1: \n" |
michael@0 | 3664 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3665 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3666 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3667 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 3668 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3669 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3670 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3671 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3672 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3673 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3674 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3675 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3676 | BUNDLEALIGN |
michael@0 | 3677 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3678 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3679 | "sub $0x10,%3 \n" |
michael@0 | 3680 | "jg 1b \n" |
michael@0 | 3681 | : "+r"(src_yuy2), // %0 |
michael@0 | 3682 | "+r"(dst_u), // %1 |
michael@0 | 3683 | "+r"(dst_v), // %2 |
michael@0 | 3684 | "+r"(pix) // %3 |
michael@0 | 3685 | : |
michael@0 | 3686 | : "memory", "cc" |
michael@0 | 3687 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3688 | , "r14" |
michael@0 | 3689 | #endif |
michael@0 | 3690 | #if defined(__SSE2__) |
michael@0 | 3691 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 3692 | #endif |
michael@0 | 3693 | ); |
michael@0 | 3694 | } |
michael@0 | 3695 | |
michael@0 | 3696 | void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { |
michael@0 | 3697 | asm volatile ( |
michael@0 | 3698 | LABELALIGN |
michael@0 | 3699 | "1: \n" |
michael@0 | 3700 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3701 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3702 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3703 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 3704 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3705 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3706 | "sub $0x10,%2 \n" |
michael@0 | 3707 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3708 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3709 | "jg 1b \n" |
michael@0 | 3710 | : "+r"(src_uyvy), // %0 |
michael@0 | 3711 | "+r"(dst_y), // %1 |
michael@0 | 3712 | "+r"(pix) // %2 |
michael@0 | 3713 | : |
michael@0 | 3714 | : "memory", "cc" |
michael@0 | 3715 | #if defined(__SSE2__) |
michael@0 | 3716 | , "xmm0", "xmm1" |
michael@0 | 3717 | #endif |
michael@0 | 3718 | ); |
michael@0 | 3719 | } |
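
// Editor's note: UYVY is packed as U0 Y0 V0 Y1, so luma lives in the odd
// bytes (hence psrlw $8 above instead of a mask). Scalar sketch:
//
//   for (int i = 0; i < pix; ++i) {
//     dst_y[i] = src_uyvy[2 * i + 1];
//   }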
michael@0 | 3720 | |
michael@0 | 3721 | void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, |
michael@0 | 3722 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3723 | asm volatile ( |
michael@0 | 3724 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3725 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3726 | "sub %1,%2 \n" |
michael@0 | 3727 | LABELALIGN |
michael@0 | 3728 | "1: \n" |
michael@0 | 3729 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3730 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3731 | BUNDLEALIGN |
michael@0 | 3732 | MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 |
michael@0 | 3733 | MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 |
michael@0 | 3734 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3735 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 3736 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 3737 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3738 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3739 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3740 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3741 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3742 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3743 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3744 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3745 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3746 | BUNDLEALIGN |
michael@0 | 3747 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3748 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3749 | "sub $0x10,%3 \n" |
michael@0 | 3750 | "jg 1b \n" |
michael@0 | 3751 | : "+r"(src_uyvy), // %0 |
michael@0 | 3752 | "+r"(dst_u), // %1 |
michael@0 | 3753 | "+r"(dst_v), // %2 |
michael@0 | 3754 | "+r"(pix) // %3 |
michael@0 | 3755 | : "r"((intptr_t)(stride_uyvy)) // %4 |
michael@0 | 3756 | : "memory", "cc" |
michael@0 | 3757 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3758 | , "r14" |
michael@0 | 3759 | #endif |
michael@0 | 3760 | #if defined(__SSE2__) |
michael@0 | 3761 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 3762 | #endif |
michael@0 | 3763 | ); |
michael@0 | 3764 | } |
michael@0 | 3765 | |
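// A scalar sketch of UYVYToUVRow above, for illustration only (not a
// libyuv function).  Chroma from two adjacent rows is averaged with
// pavgb rounding ((a + b + 1) >> 1), then the U and V bytes (offsets 0
// and 2 of each UYVY quad) are de-interleaved into the two planes.
static void UYVYToUVRow_C_Sketch(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_v, int pix) {
  const uint8* next = src_uyvy + stride_uyvy;
  int x;
  for (x = 0; x < pix; x += 2) {
    dst_u[x / 2] = (uint8)((src_uyvy[x * 2 + 0] + next[x * 2 + 0] + 1) >> 1);
    dst_v[x / 2] = (uint8)((src_uyvy[x * 2 + 2] + next[x * 2 + 2] + 1) >> 1);
  }
}
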
michael@0 | 3766 | void UYVYToUV422Row_SSE2(const uint8* src_uyvy, |
michael@0 | 3767 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3768 | asm volatile ( |
michael@0 | 3769 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3770 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3771 | "sub %1,%2 \n" |
michael@0 | 3772 | LABELALIGN |
michael@0 | 3773 | "1: \n" |
michael@0 | 3774 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3775 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3776 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3777 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3778 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3779 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3780 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3781 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3782 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3783 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3784 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3785 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3786 | BUNDLEALIGN |
michael@0 | 3787 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3788 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3789 | "sub $0x10,%3 \n" |
michael@0 | 3790 | "jg 1b \n" |
michael@0 | 3791 | : "+r"(src_uyvy), // %0 |
michael@0 | 3792 | "+r"(dst_u), // %1 |
michael@0 | 3793 | "+r"(dst_v), // %2 |
michael@0 | 3794 | "+r"(pix) // %3 |
michael@0 | 3795 | : |
michael@0 | 3796 | : "memory", "cc" |
michael@0 | 3797 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3798 | , "r14" |
michael@0 | 3799 | #endif |
michael@0 | 3800 | #if defined(__SSE2__) |
michael@0 | 3801 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 3802 | #endif |
michael@0 | 3803 | ); |
michael@0 | 3804 | } |
michael@0 | 3805 | |
michael@0 | 3806 | void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, |
michael@0 | 3807 | uint8* dst_y, int pix) { |
michael@0 | 3808 | asm volatile ( |
michael@0 | 3809 | LABELALIGN |
michael@0 | 3810 | "1: \n" |
michael@0 | 3811 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3812 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3813 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3814 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 3815 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3816 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3817 | "sub $0x10,%2 \n" |
michael@0 | 3818 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3819 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3820 | "jg 1b \n" |
michael@0 | 3821 | : "+r"(src_uyvy), // %0 |
michael@0 | 3822 | "+r"(dst_y), // %1 |
michael@0 | 3823 | "+r"(pix) // %2 |
michael@0 | 3824 | : |
michael@0 | 3825 | : "memory", "cc" |
michael@0 | 3826 | #if defined(__SSE2__) |
michael@0 | 3827 | , "xmm0", "xmm1" |
michael@0 | 3828 | #endif |
michael@0 | 3829 | ); |
michael@0 | 3830 | } |
michael@0 | 3831 | |
michael@0 | 3832 | void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, |
michael@0 | 3833 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3834 | asm volatile ( |
michael@0 | 3835 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3836 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3837 | "sub %1,%2 \n" |
michael@0 | 3838 | LABELALIGN |
michael@0 | 3839 | "1: \n" |
michael@0 | 3840 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3841 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3842 | BUNDLEALIGN |
michael@0 | 3843 | MEMOPREG(movdqu,0x00,0,4,1,xmm2) // movdqu (%0,%4,1),%%xmm2 |
michael@0 | 3844 | MEMOPREG(movdqu,0x10,0,4,1,xmm3) // movdqu 0x10(%0,%4,1),%%xmm3 |
michael@0 | 3845 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3846 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 3847 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 3848 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3849 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3850 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3851 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3852 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3853 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3854 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3855 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3856 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3857 | BUNDLEALIGN |
michael@0 | 3858 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3859 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3860 | "sub $0x10,%3 \n" |
michael@0 | 3861 | "jg 1b \n" |
michael@0 | 3862 | : "+r"(src_uyvy), // %0 |
michael@0 | 3863 | "+r"(dst_u), // %1 |
michael@0 | 3864 | "+r"(dst_v), // %2 |
michael@0 | 3865 | "+r"(pix) // %3 |
michael@0 | 3866 | : "r"((intptr_t)(stride_uyvy)) // %4 |
michael@0 | 3867 | : "memory", "cc" |
michael@0 | 3868 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3869 | , "r14" |
michael@0 | 3870 | #endif |
michael@0 | 3871 | #if defined(__SSE2__) |
michael@0 | 3872 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 3873 | #endif |
michael@0 | 3874 | ); |
michael@0 | 3875 | } |
michael@0 | 3876 | |
michael@0 | 3877 | void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, |
michael@0 | 3878 | uint8* dst_u, uint8* dst_v, int pix) { |
michael@0 | 3879 | asm volatile ( |
michael@0 | 3880 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3881 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 3882 | "sub %1,%2 \n" |
michael@0 | 3883 | LABELALIGN |
michael@0 | 3884 | "1: \n" |
michael@0 | 3885 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 3886 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 3887 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 3888 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3889 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3890 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 3891 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 3892 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 3893 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 3894 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3895 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 3896 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 3897 | BUNDLEALIGN |
michael@0 | 3898 | MEMOPMEM(movq,xmm1,0x00,1,2,1) // movq %%xmm1,(%1,%2) |
michael@0 | 3899 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 3900 | "sub $0x10,%3 \n" |
michael@0 | 3901 | "jg 1b \n" |
michael@0 | 3902 | : "+r"(src_uyvy), // %0 |
michael@0 | 3903 | "+r"(dst_u), // %1 |
michael@0 | 3904 | "+r"(dst_v), // %2 |
michael@0 | 3905 | "+r"(pix) // %3 |
michael@0 | 3906 | : |
michael@0 | 3907 | : "memory", "cc" |
michael@0 | 3908 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 3909 | , "r14" |
michael@0 | 3910 | #endif |
michael@0 | 3911 | #if defined(__SSE2__) |
michael@0 | 3912 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 3913 | #endif |
michael@0 | 3914 | ); |
michael@0 | 3915 | } |
michael@0 | 3916 | #endif // HAS_YUY2TOYROW_SSE2 |
michael@0 | 3917 | |
michael@0 | 3918 | #ifdef HAS_ARGBBLENDROW_SSE2 |
michael@0 | 3919 | // Blend 4 ARGB pixels (16 bytes) at a time. |
michael@0 | 3920 | void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 3921 | uint8* dst_argb, int width) { |
michael@0 | 3922 | asm volatile ( |
michael@0 | 3923 | "pcmpeqb %%xmm7,%%xmm7 \n" |
michael@0 | 3924 | "psrlw $0xf,%%xmm7 \n" |
michael@0 | 3925 | "pcmpeqb %%xmm6,%%xmm6 \n" |
michael@0 | 3926 | "psrlw $0x8,%%xmm6 \n" |
michael@0 | 3927 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 3928 | "psllw $0x8,%%xmm5 \n" |
michael@0 | 3929 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 3930 | "pslld $0x18,%%xmm4 \n" |
michael@0 | 3931 | "sub $0x1,%3 \n" |
michael@0 | 3932 | "je 91f \n" |
michael@0 | 3933 | "jl 99f \n" |
michael@0 | 3934 | |
michael@0 | 3935 | // 1 pixel loop until destination pointer is aligned. |
michael@0 | 3936 | "10: \n" |
michael@0 | 3937 | "test $0xf,%2 \n" |
michael@0 | 3938 | "je 19f \n" |
michael@0 | 3939 | "movd " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 3940 | "lea " MEMLEA(0x4,0) ",%0 \n" |
michael@0 | 3941 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 3942 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 3943 | "movd " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 3944 | "psrlw $0x8,%%xmm3 \n" |
michael@0 | 3945 | "pshufhw $0xf5,%%xmm3,%%xmm3 \n" |
michael@0 | 3946 | "pshuflw $0xf5,%%xmm3,%%xmm3 \n" |
michael@0 | 3947 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 3948 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 3949 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 3950 | "movd " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 3951 | "lea " MEMLEA(0x4,1) ",%1 \n" |
michael@0 | 3952 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3953 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 3954 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 3955 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 3956 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 3957 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3958 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 3959 | "sub $0x1,%3 \n" |
michael@0 | 3960 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 3961 | "lea " MEMLEA(0x4,2) ",%2 \n" |
michael@0 | 3962 | "jge 10b \n" |
michael@0 | 3963 | |
michael@0 | 3964 | "19: \n" |
michael@0 | 3965 | "add $1-4,%3 \n" |
michael@0 | 3966 | "jl 49f \n" |
michael@0 | 3967 | |
michael@0 | 3968 | // 4 pixel loop. |
michael@0 | 3969 | LABELALIGN |
michael@0 | 3970 | "41: \n" |
michael@0 | 3971 | "movdqu " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 3972 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 3973 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 3974 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 3975 | "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 3976 | "psrlw $0x8,%%xmm3 \n" |
michael@0 | 3977 | "pshufhw $0xf5,%%xmm3,%%xmm3 \n" |
michael@0 | 3978 | "pshuflw $0xf5,%%xmm3,%%xmm3 \n" |
michael@0 | 3979 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 3980 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 3981 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 3982 | "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 3983 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 3984 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 3985 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 3986 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 3987 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 3988 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 3989 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 3990 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 3991 | "sub $0x4,%3 \n" |
michael@0 | 3992 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 3993 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 3994 | "jge 41b \n" |
michael@0 | 3995 | |
michael@0 | 3996 | "49: \n" |
michael@0 | 3997 | "add $0x3,%3 \n" |
michael@0 | 3998 | "jl 99f \n" |
michael@0 | 3999 | |
michael@0 | 4000 | // 1 pixel loop. |
michael@0 | 4001 | "91: \n" |
michael@0 | 4002 | "movd " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 4003 | "lea " MEMLEA(0x4,0) ",%0 \n" |
michael@0 | 4004 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 4005 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 4006 | "movd " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 4007 | "psrlw $0x8,%%xmm3 \n" |
michael@0 | 4008 | "pshufhw $0xf5,%%xmm3,%%xmm3 \n" |
michael@0 | 4009 | "pshuflw $0xf5,%%xmm3,%%xmm3 \n" |
michael@0 | 4010 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 4011 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 4012 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 4013 | "movd " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4014 | "lea " MEMLEA(0x4,1) ",%1 \n" |
michael@0 | 4015 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4016 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 4017 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 4018 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 4019 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 4020 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 4021 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4022 | "sub $0x1,%3 \n" |
michael@0 | 4023 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4024 | "lea " MEMLEA(0x4,2) ",%2 \n" |
michael@0 | 4025 | "jge 91b \n" |
michael@0 | 4026 | "99: \n" |
michael@0 | 4027 | : "+r"(src_argb0), // %0 |
michael@0 | 4028 | "+r"(src_argb1), // %1 |
michael@0 | 4029 | "+r"(dst_argb), // %2 |
michael@0 | 4030 | "+r"(width) // %3 |
michael@0 | 4031 | : |
michael@0 | 4032 | : "memory", "cc" |
michael@0 | 4033 | #if defined(__SSE2__) |
michael@0 | 4034 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 4035 | #endif |
michael@0 | 4036 | ); |
michael@0 | 4037 | } |
michael@0 | 4038 | #endif // HAS_ARGBBLENDROW_SSE2 |
michael@0 | 4039 | |
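// A scalar sketch of the blend above, for illustration only (not a
// libyuv function).  The SIMD loops compute "src over dst" with a
// 256-based weight: paddw with xmm7 turns (255 - src_alpha) into
// (256 - src_alpha), so each channel is src + dst * (256 - a) / 256,
// saturated, with alpha forced to 255.
static uint32 ARGBBlendPixel_C_Sketch(uint32 src, uint32 dst) {
  uint32 f = 256 - (src >> 24);  // destination weight
  uint32 b = (src & 0xff) + (((dst & 0xff) * f) >> 8);
  uint32 g = ((src >> 8) & 0xff) + ((((dst >> 8) & 0xff) * f) >> 8);
  uint32 r = ((src >> 16) & 0xff) + ((((dst >> 16) & 0xff) * f) >> 8);
  if (b > 255) b = 255;  // paddusb saturates
  if (g > 255) g = 255;
  if (r > 255) r = 255;
  return 0xff000000u | (r << 16) | (g << 8) | b;
}
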
michael@0 | 4040 | #ifdef HAS_ARGBBLENDROW_SSSE3 |
michael@0 | 4041 | // Shuffle table for isolating alpha. |
michael@0 | 4042 | static uvec8 kShuffleAlpha = { |
michael@0 | 4043 | 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
michael@0 | 4044 | 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
michael@0 | 4045 | }; |
michael@0 | 4046 | |
michael@0 | 4047 | // Blend 4 ARGB pixels (16 bytes) at a time. |
michael@0 | 4049 | |
michael@0 | 4050 | // Same as SSE2, but replaces |
michael@0 | 4051 | // psrlw xmm3, 8 // alpha |
michael@0 | 4052 | // pshufhw xmm3, xmm3,0F5h // 8 alpha words |
michael@0 | 4053 | // pshuflw xmm3, xmm3,0F5h |
michael@0 | 4054 | // with: |
michael@0 | 4055 | // pshufb xmm3, kShuffleAlpha // alpha |
michael@0 | 4056 | |
michael@0 | 4057 | void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 4058 | uint8* dst_argb, int width) { |
michael@0 | 4059 | asm volatile ( |
michael@0 | 4060 | "pcmpeqb %%xmm7,%%xmm7 \n" |
michael@0 | 4061 | "psrlw $0xf,%%xmm7 \n" |
michael@0 | 4062 | "pcmpeqb %%xmm6,%%xmm6 \n" |
michael@0 | 4063 | "psrlw $0x8,%%xmm6 \n" |
michael@0 | 4064 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 4065 | "psllw $0x8,%%xmm5 \n" |
michael@0 | 4066 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 4067 | "pslld $0x18,%%xmm4 \n" |
michael@0 | 4068 | "sub $0x1,%3 \n" |
michael@0 | 4069 | "je 91f \n" |
michael@0 | 4070 | "jl 99f \n" |
michael@0 | 4071 | |
michael@0 | 4072 | // 1 pixel loop until destination pointer is aligned. |
michael@0 | 4073 | "10: \n" |
michael@0 | 4074 | "test $0xf,%2 \n" |
michael@0 | 4075 | "je 19f \n" |
michael@0 | 4076 | "movd " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 4077 | "lea " MEMLEA(0x4,0) ",%0 \n" |
michael@0 | 4078 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 4079 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 4080 | "movd " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 4081 | "pshufb %4,%%xmm3 \n" |
michael@0 | 4082 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 4083 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 4084 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 4085 | "movd " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4086 | "lea " MEMLEA(0x4,1) ",%1 \n" |
michael@0 | 4087 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4088 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 4089 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 4090 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 4091 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 4092 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 4093 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4094 | "sub $0x1,%3 \n" |
michael@0 | 4095 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4096 | "lea " MEMLEA(0x4,2) ",%2 \n" |
michael@0 | 4097 | "jge 10b \n" |
michael@0 | 4098 | |
michael@0 | 4099 | "19: \n" |
michael@0 | 4100 | "add $1-4,%3 \n" |
michael@0 | 4101 | "jl 49f \n" |
michael@0 | 4102 | "test $0xf,%0 \n" |
michael@0 | 4103 | "jne 41f \n" |
michael@0 | 4104 | "test $0xf,%1 \n" |
michael@0 | 4105 | "jne 41f \n" |
michael@0 | 4106 | |
michael@0 | 4107 | // 4 pixel loop. |
michael@0 | 4108 | LABELALIGN |
michael@0 | 4109 | "40: \n" |
michael@0 | 4110 | "movdqa " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 4111 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4112 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 4113 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 4114 | "movdqa " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 4115 | "pshufb %4,%%xmm3 \n" |
michael@0 | 4116 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 4117 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 4118 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 4119 | "movdqa " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4120 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4121 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4122 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 4123 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 4124 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 4125 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 4126 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 4127 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4128 | "sub $0x4,%3 \n" |
michael@0 | 4129 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4130 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 4131 | "jge 40b \n" |
michael@0 | 4132 | "jmp 49f \n" |
michael@0 | 4133 | |
michael@0 | 4134 | // 4 pixel unaligned loop. |
michael@0 | 4135 | LABELALIGN |
michael@0 | 4136 | "41: \n" |
michael@0 | 4137 | "movdqu " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 4138 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4139 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 4140 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 4141 | "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 4142 | "pshufb %4,%%xmm3 \n" |
michael@0 | 4143 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 4144 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 4145 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 4146 | "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4147 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4148 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4149 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 4150 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 4151 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 4152 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 4153 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 4154 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4155 | "sub $0x4,%3 \n" |
michael@0 | 4156 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4157 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 4158 | "jge 41b \n" |
michael@0 | 4159 | |
michael@0 | 4160 | "49: \n" |
michael@0 | 4161 | "add $0x3,%3 \n" |
michael@0 | 4162 | "jl 99f \n" |
michael@0 | 4163 | |
michael@0 | 4164 | // 1 pixel loop. |
michael@0 | 4165 | "91: \n" |
michael@0 | 4166 | "movd " MEMACCESS(0) ",%%xmm3 \n" |
michael@0 | 4167 | "lea " MEMLEA(0x4,0) ",%0 \n" |
michael@0 | 4168 | "movdqa %%xmm3,%%xmm0 \n" |
michael@0 | 4169 | "pxor %%xmm4,%%xmm3 \n" |
michael@0 | 4170 | "movd " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 4171 | "pshufb %4,%%xmm3 \n" |
michael@0 | 4172 | "pand %%xmm6,%%xmm2 \n" |
michael@0 | 4173 | "paddw %%xmm7,%%xmm3 \n" |
michael@0 | 4174 | "pmullw %%xmm3,%%xmm2 \n" |
michael@0 | 4175 | "movd " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4176 | "lea " MEMLEA(0x4,1) ",%1 \n" |
michael@0 | 4177 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4178 | "por %%xmm4,%%xmm0 \n" |
michael@0 | 4179 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 4180 | "psrlw $0x8,%%xmm2 \n" |
michael@0 | 4181 | "paddusb %%xmm2,%%xmm0 \n" |
michael@0 | 4182 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 4183 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4184 | "sub $0x1,%3 \n" |
michael@0 | 4185 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4186 | "lea " MEMLEA(0x4,2) ",%2 \n" |
michael@0 | 4187 | "jge 91b \n" |
michael@0 | 4188 | "99: \n" |
michael@0 | 4189 | : "+r"(src_argb0), // %0 |
michael@0 | 4190 | "+r"(src_argb1), // %1 |
michael@0 | 4191 | "+r"(dst_argb), // %2 |
michael@0 | 4192 | "+r"(width) // %3 |
michael@0 | 4193 | : "m"(kShuffleAlpha) // %4 |
michael@0 | 4194 | : "memory", "cc" |
michael@0 | 4195 | #if defined(__SSE2__) |
michael@0 | 4196 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 4197 | #endif |
michael@0 | 4198 | ); |
michael@0 | 4199 | } |
michael@0 | 4200 | #endif // HAS_ARGBBLENDROW_SSSE3 |
michael@0 | 4201 | |
michael@0 | 4202 | #ifdef HAS_ARGBATTENUATEROW_SSE2 |
michael@0 | 4203 | // Attenuate 4 pixels at a time. |
michael@0 | 4204 | // src_argb and dst_argb must be aligned to 16 bytes. |
michael@0 | 4205 | void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 4206 | asm volatile ( |
michael@0 | 4207 | "pcmpeqb %%xmm4,%%xmm4 \n" |
michael@0 | 4208 | "pslld $0x18,%%xmm4 \n" |
michael@0 | 4209 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 4210 | "psrld $0x8,%%xmm5 \n" |
michael@0 | 4211 | |
michael@0 | 4212 | // 4 pixel loop. |
michael@0 | 4213 | LABELALIGN |
michael@0 | 4214 | "1: \n" |
michael@0 | 4215 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4216 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 4217 | "pshufhw $0xff,%%xmm0,%%xmm2 \n" |
michael@0 | 4218 | "pshuflw $0xff,%%xmm2,%%xmm2 \n" |
michael@0 | 4219 | "pmulhuw %%xmm2,%%xmm0 \n" |
michael@0 | 4220 | "movdqa " MEMACCESS(0) ",%%xmm1 \n" |
michael@0 | 4221 | "punpckhbw %%xmm1,%%xmm1 \n" |
michael@0 | 4222 | "pshufhw $0xff,%%xmm1,%%xmm2 \n" |
michael@0 | 4223 | "pshuflw $0xff,%%xmm2,%%xmm2 \n" |
michael@0 | 4224 | "pmulhuw %%xmm2,%%xmm1 \n" |
michael@0 | 4225 | "movdqa " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 4226 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4227 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 4228 | "pand %%xmm4,%%xmm2 \n" |
michael@0 | 4229 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4230 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 4231 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 4232 | "por %%xmm2,%%xmm0 \n" |
michael@0 | 4233 | "sub $0x4,%2 \n" |
michael@0 | 4234 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 4235 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4236 | "jg 1b \n" |
michael@0 | 4237 | : "+r"(src_argb), // %0 |
michael@0 | 4238 | "+r"(dst_argb), // %1 |
michael@0 | 4239 | "+r"(width) // %2 |
michael@0 | 4240 | : |
michael@0 | 4241 | : "memory", "cc" |
michael@0 | 4242 | #if defined(__SSE2__) |
michael@0 | 4243 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 4244 | #endif |
michael@0 | 4245 | ); |
michael@0 | 4246 | } |
michael@0 | 4247 | #endif // HAS_ARGBATTENUATEROW_SSE2 |
michael@0 | 4248 | |
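// A scalar sketch of the attenuate loops, for illustration only (not a
// libyuv function).  The punpcklbw x,x / pmulhuw pairing computes
// (c*257 * a*257) >> 16, and the later psrlw 8 makes it >> 24 in
// total -- a close fixed-point approximation of c * a / 255.  Alpha
// itself is passed through unchanged.
static uint32 ARGBAttenuatePixel_C_Sketch(uint32 argb) {
  uint32 a = argb >> 24;
  uint32 b = (((argb & 0xff) * 257) * (a * 257)) >> 24;
  uint32 g = ((((argb >> 8) & 0xff) * 257) * (a * 257)) >> 24;
  uint32 r = ((((argb >> 16) & 0xff) * 257) * (a * 257)) >> 24;
  return (a << 24) | (r << 16) | (g << 8) | b;
}
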
michael@0 | 4249 | #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
michael@0 | 4250 | // Shuffle table duplicating alpha |
michael@0 | 4251 | static uvec8 kShuffleAlpha0 = { |
michael@0 | 4252 | 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
michael@0 | 4253 | }; |
michael@0 | 4254 | static uvec8 kShuffleAlpha1 = { |
michael@0 | 4255 | 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
michael@0 | 4256 | 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
michael@0 | 4257 | }; |
michael@0 | 4258 | // Attenuate 4 pixels at a time. |
michael@0 | 4259 | // Alignment is not required: uses movdqu. |
michael@0 | 4260 | void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 4261 | asm volatile ( |
michael@0 | 4262 | "pcmpeqb %%xmm3,%%xmm3 \n" |
michael@0 | 4263 | "pslld $0x18,%%xmm3 \n" |
michael@0 | 4264 | "movdqa %3,%%xmm4 \n" |
michael@0 | 4265 | "movdqa %4,%%xmm5 \n" |
michael@0 | 4266 | |
michael@0 | 4267 | // 4 pixel loop. |
michael@0 | 4268 | LABELALIGN |
michael@0 | 4269 | "1: \n" |
michael@0 | 4270 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4271 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 4272 | "movdqu " MEMACCESS(0) ",%%xmm1 \n" |
michael@0 | 4273 | "punpcklbw %%xmm1,%%xmm1 \n" |
michael@0 | 4274 | "pmulhuw %%xmm1,%%xmm0 \n" |
michael@0 | 4275 | "movdqu " MEMACCESS(0) ",%%xmm1 \n" |
michael@0 | 4276 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 4277 | "movdqu " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 4278 | "punpckhbw %%xmm2,%%xmm2 \n" |
michael@0 | 4279 | "pmulhuw %%xmm2,%%xmm1 \n" |
michael@0 | 4280 | "movdqu " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 4281 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4282 | "pand %%xmm3,%%xmm2 \n" |
michael@0 | 4283 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 4284 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4285 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 4286 | "por %%xmm2,%%xmm0 \n" |
michael@0 | 4287 | "sub $0x4,%2 \n" |
michael@0 | 4288 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 4289 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4290 | "jg 1b \n" |
michael@0 | 4291 | : "+r"(src_argb), // %0 |
michael@0 | 4292 | "+r"(dst_argb), // %1 |
michael@0 | 4293 | "+r"(width) // %2 |
michael@0 | 4294 | : "m"(kShuffleAlpha0), // %3 |
michael@0 | 4295 | "m"(kShuffleAlpha1) // %4 |
michael@0 | 4296 | : "memory", "cc" |
michael@0 | 4297 | #if defined(__SSE2__) |
michael@0 | 4298 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 4299 | #endif |
michael@0 | 4300 | ); |
michael@0 | 4301 | } |
michael@0 | 4302 | #endif // HAS_ARGBATTENUATEROW_SSSE3 |
michael@0 | 4303 | |
michael@0 | 4304 | #ifdef HAS_ARGBUNATTENUATEROW_SSE2 |
michael@0 | 4305 | // Unattenuate 4 pixels at a time. |
michael@0 | 4306 | // Alignment is not required: uses movdqu. |
michael@0 | 4307 | void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 4308 | int width) { |
michael@0 | 4309 | uintptr_t alpha = 0; |
michael@0 | 4310 | asm volatile ( |
michael@0 | 4311 | // 4 pixel loop. |
michael@0 | 4312 | LABELALIGN |
michael@0 | 4313 | "1: \n" |
michael@0 | 4314 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4315 | "movzb " MEMACCESS2(0x03,0) ",%3 \n" |
michael@0 | 4316 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 4317 | MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 |
michael@0 | 4318 | "movzb " MEMACCESS2(0x07,0) ",%3 \n" |
michael@0 | 4319 | MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 |
michael@0 | 4320 | "pshuflw $0x40,%%xmm2,%%xmm2 \n" |
michael@0 | 4321 | "pshuflw $0x40,%%xmm3,%%xmm3 \n" |
michael@0 | 4322 | "movlhps %%xmm3,%%xmm2 \n" |
michael@0 | 4323 | "pmulhuw %%xmm2,%%xmm0 \n" |
michael@0 | 4324 | "movdqu " MEMACCESS(0) ",%%xmm1 \n" |
michael@0 | 4325 | "movzb " MEMACCESS2(0x0b,0) ",%3 \n" |
michael@0 | 4326 | "punpckhbw %%xmm1,%%xmm1 \n" |
michael@0 | 4327 | BUNDLEALIGN |
michael@0 | 4328 | MEMOPREG(movd,0x00,4,3,4,xmm2) // movd 0x0(%4,%3,4),%%xmm2 |
michael@0 | 4329 | "movzb " MEMACCESS2(0x0f,0) ",%3 \n" |
michael@0 | 4330 | MEMOPREG(movd,0x00,4,3,4,xmm3) // movd 0x0(%4,%3,4),%%xmm3 |
michael@0 | 4331 | "pshuflw $0x40,%%xmm2,%%xmm2 \n" |
michael@0 | 4332 | "pshuflw $0x40,%%xmm3,%%xmm3 \n" |
michael@0 | 4333 | "movlhps %%xmm3,%%xmm2 \n" |
michael@0 | 4334 | "pmulhuw %%xmm2,%%xmm1 \n" |
michael@0 | 4335 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4336 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 4337 | "sub $0x4,%2 \n" |
michael@0 | 4338 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 4339 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4340 | "jg 1b \n" |
michael@0 | 4341 | : "+r"(src_argb), // %0 |
michael@0 | 4342 | "+r"(dst_argb), // %1 |
michael@0 | 4343 | "+r"(width), // %2 |
michael@0 | 4344 | "+r"(alpha) // %3 |
michael@0 | 4345 | : "r"(fixed_invtbl8) // %4 |
michael@0 | 4346 | : "memory", "cc" |
michael@0 | 4347 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 4348 | , "r14" |
michael@0 | 4349 | #endif |
michael@0 | 4350 | #if defined(__SSE2__) |
michael@0 | 4351 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 4352 | #endif |
michael@0 | 4353 | ); |
michael@0 | 4354 | } |
michael@0 | 4355 | #endif // HAS_ARGBUNATTENUATEROW_SSE2 |
michael@0 | 4356 | |
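// A conceptual scalar sketch of the unattenuate loop, for illustration
// only (not a libyuv function).  fixed_invtbl8 holds fixed-point
// reciprocals indexed by the alpha byte; in effect each channel is
// divided back out by alpha, with packuswb providing the clamp:
static uint32 ARGBUnattenuatePixel_C_Sketch(uint32 argb) {
  uint32 a = argb >> 24;
  uint32 b = argb & 0xff;
  uint32 g = (argb >> 8) & 0xff;
  uint32 r = (argb >> 16) & 0xff;
  if (a) {
    b = b * 255 / a; if (b > 255) b = 255;
    g = g * 255 / a; if (g > 255) g = 255;
    r = r * 255 / a; if (r > 255) r = 255;
  }
  return (a << 24) | (r << 16) | (g << 8) | b;
}
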
michael@0 | 4357 | #ifdef HAS_ARGBGRAYROW_SSSE3 |
michael@0 | 4358 | // Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels. |
michael@0 | 4359 | void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { |
michael@0 | 4360 | asm volatile ( |
michael@0 | 4361 | "movdqa %3,%%xmm4 \n" |
michael@0 | 4362 | "movdqa %4,%%xmm5 \n" |
michael@0 | 4363 | |
michael@0 | 4364 | // 8 pixel loop. |
michael@0 | 4365 | LABELALIGN |
michael@0 | 4366 | "1: \n" |
michael@0 | 4367 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4368 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 4369 | "pmaddubsw %%xmm4,%%xmm0 \n" |
michael@0 | 4370 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 4371 | "phaddw %%xmm1,%%xmm0 \n" |
michael@0 | 4372 | "paddw %%xmm5,%%xmm0 \n" |
michael@0 | 4373 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 4374 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 4375 | "movdqa " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 4376 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm3 \n" |
michael@0 | 4377 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 4378 | "psrld $0x18,%%xmm2 \n" |
michael@0 | 4379 | "psrld $0x18,%%xmm3 \n" |
michael@0 | 4380 | "packuswb %%xmm3,%%xmm2 \n" |
michael@0 | 4381 | "packuswb %%xmm2,%%xmm2 \n" |
michael@0 | 4382 | "movdqa %%xmm0,%%xmm3 \n" |
michael@0 | 4383 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 4384 | "punpcklbw %%xmm2,%%xmm3 \n" |
michael@0 | 4385 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 4386 | "punpcklwd %%xmm3,%%xmm0 \n" |
michael@0 | 4387 | "punpckhwd %%xmm3,%%xmm1 \n" |
michael@0 | 4388 | "sub $0x8,%2 \n" |
michael@0 | 4389 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 4390 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 4391 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 4392 | "jg 1b \n" |
michael@0 | 4393 | : "+r"(src_argb), // %0 |
michael@0 | 4394 | "+r"(dst_argb), // %1 |
michael@0 | 4395 | "+r"(width) // %2 |
michael@0 | 4396 | : "m"(kARGBToYJ), // %3 |
michael@0 | 4397 | "m"(kAddYJ64) // %4 |
michael@0 | 4398 | : "memory", "cc" |
michael@0 | 4399 | #if defined(__SSE2__) |
michael@0 | 4400 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 4401 | #endif |
michael@0 | 4402 | ); |
michael@0 | 4403 | } |
michael@0 | 4404 | #endif // HAS_ARGBGRAYROW_SSSE3 |
michael@0 | 4405 | |
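// A scalar sketch of the gray loop, for illustration only (not a
// libyuv function).  It uses the kARGBToYJ weights (15/75/38, which
// sum to 128) plus the 64 rounding bias added via kAddYJ64 before the
// >> 7, and keeps the source alpha.
static uint32 ARGBGrayPixel_C_Sketch(uint32 argb) {
  uint32 b = argb & 0xff;
  uint32 g = (argb >> 8) & 0xff;
  uint32 r = (argb >> 16) & 0xff;
  uint32 y = (b * 15 + g * 75 + r * 38 + 64) >> 7;
  return (argb & 0xff000000u) | (y << 16) | (y << 8) | y;
}
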
michael@0 | 4406 | #ifdef HAS_ARGBSEPIAROW_SSSE3 |
michael@0 | 4407 | // b = (r * 35 + g * 68 + b * 17) >> 7 |
michael@0 | 4408 | // g = (r * 45 + g * 88 + b * 22) >> 7 |
michael@0 | 4409 | // r = (r * 50 + g * 98 + b * 24) >> 7 |
michael@0 | 4410 | // Constant for ARGB color to sepia tone |
michael@0 | 4411 | static vec8 kARGBToSepiaB = { |
michael@0 | 4412 | 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 |
michael@0 | 4413 | }; |
michael@0 | 4414 | |
michael@0 | 4415 | static vec8 kARGBToSepiaG = { |
michael@0 | 4416 | 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 |
michael@0 | 4417 | }; |
michael@0 | 4418 | |
michael@0 | 4419 | static vec8 kARGBToSepiaR = { |
michael@0 | 4420 | 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 |
michael@0 | 4421 | }; |
michael@0 | 4422 | |
michael@0 | 4423 | // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. |
michael@0 | 4424 | void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { |
michael@0 | 4425 | asm volatile ( |
michael@0 | 4426 | "movdqa %2,%%xmm2 \n" |
michael@0 | 4427 | "movdqa %3,%%xmm3 \n" |
michael@0 | 4428 | "movdqa %4,%%xmm4 \n" |
michael@0 | 4429 | |
michael@0 | 4430 | // 8 pixel loop. |
michael@0 | 4431 | LABELALIGN |
michael@0 | 4432 | "1: \n" |
michael@0 | 4433 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4434 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" |
michael@0 | 4435 | "pmaddubsw %%xmm2,%%xmm0 \n" |
michael@0 | 4436 | "pmaddubsw %%xmm2,%%xmm6 \n" |
michael@0 | 4437 | "phaddw %%xmm6,%%xmm0 \n" |
michael@0 | 4438 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 4439 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 4440 | "movdqa " MEMACCESS(0) ",%%xmm5 \n" |
michael@0 | 4441 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 4442 | "pmaddubsw %%xmm3,%%xmm5 \n" |
michael@0 | 4443 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 4444 | "phaddw %%xmm1,%%xmm5 \n" |
michael@0 | 4445 | "psrlw $0x7,%%xmm5 \n" |
michael@0 | 4446 | "packuswb %%xmm5,%%xmm5 \n" |
michael@0 | 4447 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 4448 | "movdqa " MEMACCESS(0) ",%%xmm5 \n" |
michael@0 | 4449 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 4450 | "pmaddubsw %%xmm4,%%xmm5 \n" |
michael@0 | 4451 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 4452 | "phaddw %%xmm1,%%xmm5 \n" |
michael@0 | 4453 | "psrlw $0x7,%%xmm5 \n" |
michael@0 | 4454 | "packuswb %%xmm5,%%xmm5 \n" |
michael@0 | 4455 | "movdqa " MEMACCESS(0) ",%%xmm6 \n" |
michael@0 | 4456 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 4457 | "psrld $0x18,%%xmm6 \n" |
michael@0 | 4458 | "psrld $0x18,%%xmm1 \n" |
michael@0 | 4459 | "packuswb %%xmm1,%%xmm6 \n" |
michael@0 | 4460 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 4461 | "punpcklbw %%xmm6,%%xmm5 \n" |
michael@0 | 4462 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 4463 | "punpcklwd %%xmm5,%%xmm0 \n" |
michael@0 | 4464 | "punpckhwd %%xmm5,%%xmm1 \n" |
michael@0 | 4465 | "sub $0x8,%1 \n" |
michael@0 | 4466 | "movdqa %%xmm0," MEMACCESS(0) " \n" |
michael@0 | 4467 | "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" |
michael@0 | 4468 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 4469 | "jg 1b \n" |
michael@0 | 4470 | : "+r"(dst_argb), // %0 |
michael@0 | 4471 | "+r"(width) // %1 |
michael@0 | 4472 | : "m"(kARGBToSepiaB), // %2 |
michael@0 | 4473 | "m"(kARGBToSepiaG), // %3 |
michael@0 | 4474 | "m"(kARGBToSepiaR) // %4 |
michael@0 | 4475 | : "memory", "cc" |
michael@0 | 4476 | #if defined(__SSE2__) |
michael@0 | 4477 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 4478 | #endif |
michael@0 | 4479 | ); |
michael@0 | 4480 | } |
michael@0 | 4481 | #endif // HAS_ARGBSEPIAROW_SSSE3 |
michael@0 | 4482 | |
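// A scalar sketch of the sepia loop, for illustration only (not a
// libyuv function), applying the matrix from the comment above
// kARGBToSepiaB; packuswb provides the clamp to 255 in the SIMD path.
static uint32 ARGBSepiaPixel_C_Sketch(uint32 argb) {
  uint32 b = argb & 0xff;
  uint32 g = (argb >> 8) & 0xff;
  uint32 r = (argb >> 16) & 0xff;
  uint32 sb = (r * 35 + g * 68 + b * 17) >> 7;
  uint32 sg = (r * 45 + g * 88 + b * 22) >> 7;
  uint32 sr = (r * 50 + g * 98 + b * 24) >> 7;
  if (sb > 255) sb = 255;
  if (sg > 255) sg = 255;
  if (sr > 255) sr = 255;
  return (argb & 0xff000000u) | (sr << 16) | (sg << 8) | sb;
}
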
michael@0 | 4483 | #ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 |
michael@0 | 4484 | // Transform 8 ARGB pixels (32 bytes) with color matrix. |
michael@0 | 4485 | // Same as Sepia except matrix is provided. |
michael@0 | 4486 | void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 4487 | const int8* matrix_argb, int width) { |
michael@0 | 4488 | asm volatile ( |
michael@0 | 4489 | "movdqu " MEMACCESS(3) ",%%xmm5 \n" |
michael@0 | 4490 | "pshufd $0x00,%%xmm5,%%xmm2 \n" |
michael@0 | 4491 | "pshufd $0x55,%%xmm5,%%xmm3 \n" |
michael@0 | 4492 | "pshufd $0xaa,%%xmm5,%%xmm4 \n" |
michael@0 | 4493 | "pshufd $0xff,%%xmm5,%%xmm5 \n" |
michael@0 | 4494 | |
michael@0 | 4495 | // 8 pixel loop. |
michael@0 | 4496 | LABELALIGN |
michael@0 | 4497 | "1: \n" |
michael@0 | 4498 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4499 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" |
michael@0 | 4500 | "pmaddubsw %%xmm2,%%xmm0 \n" |
michael@0 | 4501 | "pmaddubsw %%xmm2,%%xmm7 \n" |
michael@0 | 4502 | "movdqa " MEMACCESS(0) ",%%xmm6 \n" |
michael@0 | 4503 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 4504 | "pmaddubsw %%xmm3,%%xmm6 \n" |
michael@0 | 4505 | "pmaddubsw %%xmm3,%%xmm1 \n" |
michael@0 | 4506 | "phaddsw %%xmm7,%%xmm0 \n" |
michael@0 | 4507 | "phaddsw %%xmm1,%%xmm6 \n" |
michael@0 | 4508 | "psraw $0x6,%%xmm0 \n" |
michael@0 | 4509 | "psraw $0x6,%%xmm6 \n" |
michael@0 | 4510 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 4511 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 4512 | "punpcklbw %%xmm6,%%xmm0 \n" |
michael@0 | 4513 | "movdqa " MEMACCESS(0) ",%%xmm1 \n" |
michael@0 | 4514 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" |
michael@0 | 4515 | "pmaddubsw %%xmm4,%%xmm1 \n" |
michael@0 | 4516 | "pmaddubsw %%xmm4,%%xmm7 \n" |
michael@0 | 4517 | "phaddsw %%xmm7,%%xmm1 \n" |
michael@0 | 4518 | "movdqa " MEMACCESS(0) ",%%xmm6 \n" |
michael@0 | 4519 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm7 \n" |
michael@0 | 4520 | "pmaddubsw %%xmm5,%%xmm6 \n" |
michael@0 | 4521 | "pmaddubsw %%xmm5,%%xmm7 \n" |
michael@0 | 4522 | "phaddsw %%xmm7,%%xmm6 \n" |
michael@0 | 4523 | "psraw $0x6,%%xmm1 \n" |
michael@0 | 4524 | "psraw $0x6,%%xmm6 \n" |
michael@0 | 4525 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 4526 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 4527 | "punpcklbw %%xmm6,%%xmm1 \n" |
michael@0 | 4528 | "movdqa %%xmm0,%%xmm6 \n" |
michael@0 | 4529 | "punpcklwd %%xmm1,%%xmm0 \n" |
michael@0 | 4530 | "punpckhwd %%xmm1,%%xmm6 \n" |
michael@0 | 4531 | "sub $0x8,%2 \n" |
michael@0 | 4532 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 4533 | "movdqa %%xmm6," MEMACCESS2(0x10,1) " \n" |
michael@0 | 4534 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 4535 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 4536 | "jg 1b \n" |
michael@0 | 4537 | : "+r"(src_argb), // %0 |
michael@0 | 4538 | "+r"(dst_argb), // %1 |
michael@0 | 4539 | "+r"(width) // %2 |
michael@0 | 4540 | : "r"(matrix_argb) // %3 |
michael@0 | 4541 | : "memory", "cc" |
michael@0 | 4542 | #if defined(__SSE2__) |
michael@0 | 4543 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 4544 | #endif |
michael@0 | 4545 | ); |
michael@0 | 4546 | } |
michael@0 | 4547 | #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 |
michael@0 | 4548 | |
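// A scalar sketch of the color-matrix loop, for illustration only (not
// a libyuv function): each output channel is a signed dot product of
// the input BGRA bytes with one row of the 4x4 int8 matrix, shifted by
// 6 and clamped.  (It ignores the intermediate 16-bit saturation that
// pmaddubsw/phaddsw perform in the SIMD path.)
static void ARGBColorMatrixPixel_C_Sketch(const uint8* src, const int8* m,
                                          uint8* dst) {
  int i;
  for (i = 0; i < 4; ++i) {
    int v = (src[0] * m[i * 4 + 0] + src[1] * m[i * 4 + 1] +
             src[2] * m[i * 4 + 2] + src[3] * m[i * 4 + 3]) >> 6;
    dst[i] = (uint8)(v < 0 ? 0 : (v > 255 ? 255 : v));
  }
}
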
michael@0 | 4549 | #ifdef HAS_ARGBQUANTIZEROW_SSE2 |
michael@0 | 4550 | // Quantize 4 ARGB pixels (16 bytes). |
michael@0 | 4551 | // dst_argb must be aligned to 16 bytes. |
michael@0 | 4552 | void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, |
michael@0 | 4553 | int interval_offset, int width) { |
michael@0 | 4554 | asm volatile ( |
michael@0 | 4555 | "movd %2,%%xmm2 \n" |
michael@0 | 4556 | "movd %3,%%xmm3 \n" |
michael@0 | 4557 | "movd %4,%%xmm4 \n" |
michael@0 | 4558 | "pshuflw $0x40,%%xmm2,%%xmm2 \n" |
michael@0 | 4559 | "pshufd $0x44,%%xmm2,%%xmm2 \n" |
michael@0 | 4560 | "pshuflw $0x40,%%xmm3,%%xmm3 \n" |
michael@0 | 4561 | "pshufd $0x44,%%xmm3,%%xmm3 \n" |
michael@0 | 4562 | "pshuflw $0x40,%%xmm4,%%xmm4 \n" |
michael@0 | 4563 | "pshufd $0x44,%%xmm4,%%xmm4 \n" |
michael@0 | 4564 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 4565 | "pcmpeqb %%xmm6,%%xmm6 \n" |
michael@0 | 4566 | "pslld $0x18,%%xmm6 \n" |
michael@0 | 4567 | |
michael@0 | 4568 | // 4 pixel loop. |
michael@0 | 4569 | LABELALIGN |
michael@0 | 4570 | "1: \n" |
michael@0 | 4571 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4572 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 4573 | "pmulhuw %%xmm2,%%xmm0 \n" |
michael@0 | 4574 | "movdqa " MEMACCESS(0) ",%%xmm1 \n" |
michael@0 | 4575 | "punpckhbw %%xmm5,%%xmm1 \n" |
michael@0 | 4576 | "pmulhuw %%xmm2,%%xmm1 \n" |
michael@0 | 4577 | "pmullw %%xmm3,%%xmm0 \n" |
michael@0 | 4578 | "movdqa " MEMACCESS(0) ",%%xmm7 \n" |
michael@0 | 4579 | "pmullw %%xmm3,%%xmm1 \n" |
michael@0 | 4580 | "pand %%xmm6,%%xmm7 \n" |
michael@0 | 4581 | "paddw %%xmm4,%%xmm0 \n" |
michael@0 | 4582 | "paddw %%xmm4,%%xmm1 \n" |
michael@0 | 4583 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 4584 | "por %%xmm7,%%xmm0 \n" |
michael@0 | 4585 | "sub $0x4,%1 \n" |
michael@0 | 4586 | "movdqa %%xmm0," MEMACCESS(0) " \n" |
michael@0 | 4587 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4588 | "jg 1b \n" |
michael@0 | 4589 | : "+r"(dst_argb), // %0 |
michael@0 | 4590 | "+r"(width) // %1 |
michael@0 | 4591 | : "r"(scale), // %2 |
michael@0 | 4592 | "r"(interval_size), // %3 |
michael@0 | 4593 | "r"(interval_offset) // %4 |
michael@0 | 4594 | : "memory", "cc" |
michael@0 | 4595 | #if defined(__SSE2__) |
michael@0 | 4596 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 4597 | #endif |
michael@0 | 4598 | ); |
michael@0 | 4599 | } |
michael@0 | 4600 | #endif // HAS_ARGBQUANTIZEROW_SSE2 |
michael@0 | 4601 | |
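// A scalar sketch of the quantize loop, for illustration only (not a
// libyuv function): each color channel is bucketed with a 16.16
// fixed-point multiply by 'scale', then mapped back via interval_size
// and interval_offset; alpha is passed through unchanged in the SIMD
// path.
static uint8 ARGBQuantizeChannel_C_Sketch(uint8 c, int scale,
                                          int interval_size,
                                          int interval_offset) {
  int v = ((c * scale) >> 16) * interval_size + interval_offset;
  return (uint8)(v > 255 ? 255 : v);
}
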
michael@0 | 4602 | #ifdef HAS_ARGBSHADEROW_SSE2 |
michael@0 | 4603 | // Shade 4 pixels at a time by specified value. |
michael@0 | 4604 | // src_argb and dst_argb must be aligned to 16 bytes. |
michael@0 | 4605 | void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, |
michael@0 | 4606 | uint32 value) { |
michael@0 | 4607 | asm volatile ( |
michael@0 | 4608 | "movd %3,%%xmm2 \n" |
michael@0 | 4609 | "punpcklbw %%xmm2,%%xmm2 \n" |
michael@0 | 4610 | "punpcklqdq %%xmm2,%%xmm2 \n" |
michael@0 | 4611 | |
michael@0 | 4612 | // 4 pixel loop. |
michael@0 | 4613 | LABELALIGN |
michael@0 | 4614 | "1: \n" |
michael@0 | 4615 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4616 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4617 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 4618 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 4619 | "punpckhbw %%xmm1,%%xmm1 \n" |
michael@0 | 4620 | "pmulhuw %%xmm2,%%xmm0 \n" |
michael@0 | 4621 | "pmulhuw %%xmm2,%%xmm1 \n" |
michael@0 | 4622 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 4623 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 4624 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 4625 | "sub $0x4,%2 \n" |
michael@0 | 4626 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 4627 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4628 | "jg 1b \n" |
michael@0 | 4629 | : "+r"(src_argb), // %0 |
michael@0 | 4630 | "+r"(dst_argb), // %1 |
michael@0 | 4631 | "+r"(width) // %2 |
michael@0 | 4632 | : "r"(value) // %3 |
michael@0 | 4633 | : "memory", "cc" |
michael@0 | 4634 | #if defined(__SSE2__) |
michael@0 | 4635 | , "xmm0", "xmm1", "xmm2" |
michael@0 | 4636 | #endif |
michael@0 | 4637 | ); |
michael@0 | 4638 | } |
michael@0 | 4639 | #endif // HAS_ARGBSHADEROW_SSE2 |
michael@0 | 4640 | |
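// A scalar sketch of the shade loop, for illustration only (not a
// libyuv function).  Every channel, alpha included, is multiplied by
// the matching byte of 'value' with the same (c*257 * v*257) >> 24
// trick as attenuate, i.e. approximately c * v / 255 per channel.
static uint32 ARGBShadePixel_C_Sketch(uint32 argb, uint32 value) {
  uint32 out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    uint32 c = (argb >> (i * 8)) & 0xff;
    uint32 v = (value >> (i * 8)) & 0xff;
    out |= (((c * 257) * (v * 257)) >> 24) << (i * 8);
  }
  return out;
}
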
michael@0 | 4641 | #ifdef HAS_ARGBMULTIPLYROW_SSE2 |
michael@0 | 4642 | // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. |
michael@0 | 4643 | void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 4644 | uint8* dst_argb, int width) { |
michael@0 | 4645 | asm volatile ( |
michael@0 | 4646 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 4647 | |
michael@0 | 4648 | // 4 pixel loop. |
michael@0 | 4649 | LABELALIGN |
michael@0 | 4650 | "1: \n" |
michael@0 | 4651 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4652 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4653 | "movdqu " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 4654 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4655 | "movdqu %%xmm0,%%xmm1 \n" |
michael@0 | 4656 | "movdqu %%xmm2,%%xmm3 \n" |
michael@0 | 4657 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 4658 | "punpckhbw %%xmm1,%%xmm1 \n" |
michael@0 | 4659 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 4660 | "punpckhbw %%xmm5,%%xmm3 \n" |
michael@0 | 4661 | "pmulhuw %%xmm2,%%xmm0 \n" |
michael@0 | 4662 | "pmulhuw %%xmm3,%%xmm1 \n" |
michael@0 | 4663 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 4664 | "sub $0x4,%3 \n" |
michael@0 | 4665 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4666 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 4667 | "jg 1b \n" |
michael@0 | 4668 | : "+r"(src_argb0), // %0 |
michael@0 | 4669 | "+r"(src_argb1), // %1 |
michael@0 | 4670 | "+r"(dst_argb), // %2 |
michael@0 | 4671 | "+r"(width) // %3 |
michael@0 | 4672 | : |
michael@0 | 4673 | : "memory", "cc" |
michael@0 | 4674 | #if defined(__SSE2__) |
michael@0 | 4675 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 4676 | #endif |
michael@0 | 4677 | ); |
michael@0 | 4678 | } |
michael@0 | 4679 | #endif // HAS_ARGBMULTIPLYROW_SSE2 |
michael@0 | 4680 | |
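// A scalar sketch of the multiply loop, for illustration only (not a
// libyuv function): one operand is widened to c*257, the other
// zero-extended, so pmulhuw yields (c0 * 257 * c1) >> 16 -- a
// fixed-point stand-in for c0 * c1 / 255 that can be off by one at the
// extremes.
static uint8 ARGBMultiplyChannel_C_Sketch(uint8 c0, uint8 c1) {
  return (uint8)(((c0 * 257) * c1) >> 16);
}
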
michael@0 | 4681 | #ifdef HAS_ARGBADDROW_SSE2 |
michael@0 | 4682 | // Add 2 rows of ARGB pixels together, 4 pixels at a time. |
michael@0 | 4683 | void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 4684 | uint8* dst_argb, int width) { |
michael@0 | 4685 | asm volatile ( |
michael@0 | 4686 | // 4 pixel loop. |
michael@0 | 4687 | LABELALIGN |
michael@0 | 4688 | "1: \n" |
michael@0 | 4689 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4690 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4691 | "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4692 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4693 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4694 | "sub $0x4,%3 \n" |
michael@0 | 4695 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4696 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 4697 | "jg 1b \n" |
michael@0 | 4698 | : "+r"(src_argb0), // %0 |
michael@0 | 4699 | "+r"(src_argb1), // %1 |
michael@0 | 4700 | "+r"(dst_argb), // %2 |
michael@0 | 4701 | "+r"(width) // %3 |
michael@0 | 4702 | : |
michael@0 | 4703 | : "memory", "cc" |
michael@0 | 4704 | #if defined(__SSE2__) |
michael@0 | 4705 | , "xmm0", "xmm1" |
michael@0 | 4706 | #endif |
michael@0 | 4707 | ); |
michael@0 | 4708 | } |
michael@0 | 4709 | #endif // HAS_ARGBADDROW_SSE2 |
michael@0 | 4710 | |
michael@0 | 4711 | #ifdef HAS_ARGBSUBTRACTROW_SSE2 |
michael@0 | 4712 | // Subtract 2 rows of ARGB pixels, 4 pixels at a time. |
michael@0 | 4713 | void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, |
michael@0 | 4714 | uint8* dst_argb, int width) { |
michael@0 | 4715 | asm volatile ( |
michael@0 | 4716 | // 4 pixel loop. |
michael@0 | 4717 | LABELALIGN |
michael@0 | 4718 | "1: \n" |
michael@0 | 4719 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4720 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4721 | "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 4722 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 4723 | "psubusb %%xmm1,%%xmm0 \n" |
michael@0 | 4724 | "sub $0x4,%3 \n" |
michael@0 | 4725 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4726 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 4727 | "jg 1b \n" |
michael@0 | 4728 | : "+r"(src_argb0), // %0 |
michael@0 | 4729 | "+r"(src_argb1), // %1 |
michael@0 | 4730 | "+r"(dst_argb), // %2 |
michael@0 | 4731 | "+r"(width) // %3 |
michael@0 | 4732 | : |
michael@0 | 4733 | : "memory", "cc" |
michael@0 | 4734 | #if defined(__SSE2__) |
michael@0 | 4735 | , "xmm0", "xmm1" |
michael@0 | 4736 | #endif |
michael@0 | 4737 | ); |
michael@0 | 4738 | } |
michael@0 | 4739 | #endif // HAS_ARGBSUBTRACTROW_SSE2 |
michael@0 | 4740 | |
michael@0 | 4741 | #ifdef HAS_SOBELXROW_SSE2 |
michael@0 | 4742 | // SobelX as a matrix is |
michael@0 | 4743 | // -1 0 1 |
michael@0 | 4744 | // -2 0 2 |
michael@0 | 4745 | // -1 0 1 |
michael@0 | 4746 | void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
michael@0 | 4747 | const uint8* src_y2, uint8* dst_sobelx, int width) { |
michael@0 | 4748 | asm volatile ( |
michael@0 | 4749 | "sub %0,%1 \n" |
michael@0 | 4750 | "sub %0,%2 \n" |
michael@0 | 4751 | "sub %0,%3 \n" |
michael@0 | 4752 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 4753 | |
michael@0 | 4754 | // 8 pixel loop. |
michael@0 | 4755 | LABELALIGN |
michael@0 | 4756 | "1: \n" |
michael@0 | 4757 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4758 | "movq " MEMACCESS2(0x2,0) ",%%xmm1 \n" |
michael@0 | 4759 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 4760 | "punpcklbw %%xmm5,%%xmm1 \n" |
michael@0 | 4761 | "psubw %%xmm1,%%xmm0 \n" |
michael@0 | 4762 | BUNDLEALIGN |
michael@0 | 4763 | MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 |
michael@0 | 4764 | MEMOPREG(movq,0x02,0,1,1,xmm2) // movq 0x2(%0,%1,1),%%xmm2 |
michael@0 | 4765 | "punpcklbw %%xmm5,%%xmm1 \n" |
michael@0 | 4766 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 4767 | "psubw %%xmm2,%%xmm1 \n" |
michael@0 | 4768 | BUNDLEALIGN |
michael@0 | 4769 | MEMOPREG(movq,0x00,0,2,1,xmm2) // movq (%0,%2,1),%%xmm2 |
michael@0 | 4770 | MEMOPREG(movq,0x02,0,2,1,xmm3) // movq 0x2(%0,%2,1),%%xmm3 |
michael@0 | 4771 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 4772 | "punpcklbw %%xmm5,%%xmm3 \n" |
michael@0 | 4773 | "psubw %%xmm3,%%xmm2 \n" |
michael@0 | 4774 | "paddw %%xmm2,%%xmm0 \n" |
michael@0 | 4775 | "paddw %%xmm1,%%xmm0 \n" |
michael@0 | 4776 | "paddw %%xmm1,%%xmm0 \n" |
michael@0 | 4777 | "pxor %%xmm1,%%xmm1 \n" |
michael@0 | 4778 | "psubw %%xmm0,%%xmm1 \n" |
michael@0 | 4779 | "pmaxsw %%xmm1,%%xmm0 \n" |
michael@0 | 4780 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 4781 | "sub $0x8,%4 \n" |
michael@0 | 4782 | BUNDLEALIGN |
michael@0 | 4783 | MEMOPMEM(movq,xmm0,0x00,0,3,1) // movq %%xmm0,(%0,%3,1) |
michael@0 | 4784 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 4785 | "jg 1b \n" |
michael@0 | 4786 | : "+r"(src_y0), // %0 |
michael@0 | 4787 | "+r"(src_y1), // %1 |
michael@0 | 4788 | "+r"(src_y2), // %2 |
michael@0 | 4789 | "+r"(dst_sobelx), // %3 |
michael@0 | 4790 | "+r"(width) // %4 |
michael@0 | 4791 | : |
michael@0 | 4792 | : "memory", "cc" |
michael@0 | 4793 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 4794 | , "r14" |
michael@0 | 4795 | #endif |
michael@0 | 4796 | #if defined(__SSE2__) |
michael@0 | 4797 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 4798 | #endif |
michael@0 | 4799 | ); |
michael@0 | 4800 | } |
michael@0 | 4801 | #endif // HAS_SOBELXROW_SSE2 |
michael@0 | 4802 | |
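// A scalar sketch of the SobelX loop, for illustration only (not a
// libyuv function): a horizontal difference across three rows with
// 1-2-1 weights, where pmaxsw against the negation implements abs()
// and packuswb saturates the result to a byte.
static uint8 SobelXPixel_C_Sketch(const uint8* y0, const uint8* y1,
                                  const uint8* y2, int i) {
  int sobel = (y0[i] - y0[i + 2]) + 2 * (y1[i] - y1[i + 2]) +
              (y2[i] - y2[i + 2]);
  if (sobel < 0) sobel = -sobel;
  return (uint8)(sobel > 255 ? 255 : sobel);
}
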
michael@0 | 4803 | #ifdef HAS_SOBELYROW_SSE2 |
michael@0 | 4804 | // SobelY as a matrix is |
michael@0 | 4805 | // -1 -2 -1 |
michael@0 | 4806 | // 0 0 0 |
michael@0 | 4807 | // 1 2 1 |
michael@0 | 4808 | void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, |
michael@0 | 4809 | uint8* dst_sobely, int width) { |
michael@0 | 4810 | asm volatile ( |
michael@0 | 4811 | "sub %0,%1 \n" |
michael@0 | 4812 | "sub %0,%2 \n" |
michael@0 | 4813 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 4814 | |
michael@0 | 4815 | // 8 pixel loop. |
michael@0 | 4816 | LABELALIGN |
michael@0 | 4817 | "1: \n" |
michael@0 | 4818 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4819 | MEMOPREG(movq,0x00,0,1,1,xmm1) // movq (%0,%1,1),%%xmm1 |
michael@0 | 4820 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 4821 | "punpcklbw %%xmm5,%%xmm1 \n" |
michael@0 | 4822 | "psubw %%xmm1,%%xmm0 \n" |
michael@0 | 4823 | BUNDLEALIGN |
michael@0 | 4824 | "movq " MEMACCESS2(0x1,0) ",%%xmm1 \n" |
michael@0 | 4825 | MEMOPREG(movq,0x01,0,1,1,xmm2) // movq 0x1(%0,%1,1),%%xmm2 |
michael@0 | 4826 | "punpcklbw %%xmm5,%%xmm1 \n" |
michael@0 | 4827 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 4828 | "psubw %%xmm2,%%xmm1 \n" |
michael@0 | 4829 | BUNDLEALIGN |
michael@0 | 4830 | "movq " MEMACCESS2(0x2,0) ",%%xmm2 \n" |
michael@0 | 4831 | MEMOPREG(movq,0x02,0,1,1,xmm3) // movq 0x2(%0,%1,1),%%xmm3 |
michael@0 | 4832 | "punpcklbw %%xmm5,%%xmm2 \n" |
michael@0 | 4833 | "punpcklbw %%xmm5,%%xmm3 \n" |
michael@0 | 4834 | "psubw %%xmm3,%%xmm2 \n" |
michael@0 | 4835 | "paddw %%xmm2,%%xmm0 \n" |
michael@0 | 4836 | "paddw %%xmm1,%%xmm0 \n" |
michael@0 | 4837 | "paddw %%xmm1,%%xmm0 \n" |
michael@0 | 4838 | "pxor %%xmm1,%%xmm1 \n" |
michael@0 | 4839 | "psubw %%xmm0,%%xmm1 \n" |
michael@0 | 4840 | "pmaxsw %%xmm1,%%xmm0 \n" |
michael@0 | 4841 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 4842 | "sub $0x8,%3 \n" |
michael@0 | 4843 | BUNDLEALIGN |
michael@0 | 4844 | MEMOPMEM(movq,xmm0,0x00,0,2,1) // movq %%xmm0,(%0,%2,1) |
michael@0 | 4845 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 4846 | "jg 1b \n" |
michael@0 | 4847 | : "+r"(src_y0), // %0 |
michael@0 | 4848 | "+r"(src_y1), // %1 |
michael@0 | 4849 | "+r"(dst_sobely), // %2 |
michael@0 | 4850 | "+r"(width) // %3 |
michael@0 | 4851 | : |
michael@0 | 4852 | : "memory", "cc" |
michael@0 | 4853 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 4854 | , "r14" |
michael@0 | 4855 | #endif |
michael@0 | 4856 | #if defined(__SSE2__) |
michael@0 | 4857 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 4858 | #endif |
michael@0 | 4859 | ); |
michael@0 | 4860 | } |
michael@0 | 4861 | #endif // HAS_SOBELYROW_SSE2 |
michael@0 | 4862 | |
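// A scalar sketch of the SobelY loop, for illustration only (not a
// libyuv function): the vertical counterpart differences the two
// source rows across three columns with 1-2-1 weights, then takes abs
// and saturates to a byte.
static uint8 SobelYPixel_C_Sketch(const uint8* y0, const uint8* y1, int i) {
  int sobel = (y0[i] - y1[i]) + 2 * (y0[i + 1] - y1[i + 1]) +
              (y0[i + 2] - y1[i + 2]);
  if (sobel < 0) sobel = -sobel;
  return (uint8)(sobel > 255 ? 255 : sobel);
}
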
michael@0 | 4863 | #ifdef HAS_SOBELROW_SSE2 |
michael@0 | 4864 | // Adds Sobel X and Sobel Y and stores Sobel into ARGB. |
michael@0 | 4865 | // A = 255 |
michael@0 | 4866 | // R = Sobel |
michael@0 | 4867 | // G = Sobel |
michael@0 | 4868 | // B = Sobel |
michael@0 | 4869 | void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
michael@0 | 4870 | uint8* dst_argb, int width) { |
michael@0 | 4871 | asm volatile ( |
michael@0 | 4872 | "sub %0,%1 \n" |
michael@0 | 4873 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 4874 | "pslld $0x18,%%xmm5 \n" |
michael@0 | 4875 | |
michael@0 | 4876 | // 8 pixel loop. |
michael@0 | 4877 | LABELALIGN |
michael@0 | 4878 | "1: \n" |
michael@0 | 4879 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4880 | MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 |
michael@0 | 4881 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4882 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4883 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 4884 | "punpcklbw %%xmm0,%%xmm2 \n" |
michael@0 | 4885 | "punpckhbw %%xmm0,%%xmm0 \n" |
michael@0 | 4886 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 4887 | "punpcklwd %%xmm2,%%xmm1 \n" |
michael@0 | 4888 | "punpckhwd %%xmm2,%%xmm2 \n" |
michael@0 | 4889 | "por %%xmm5,%%xmm1 \n" |
michael@0 | 4890 | "por %%xmm5,%%xmm2 \n" |
michael@0 | 4891 | "movdqa %%xmm0,%%xmm3 \n" |
michael@0 | 4892 | "punpcklwd %%xmm0,%%xmm3 \n" |
michael@0 | 4893 | "punpckhwd %%xmm0,%%xmm0 \n" |
michael@0 | 4894 | "por %%xmm5,%%xmm3 \n" |
michael@0 | 4895 | "por %%xmm5,%%xmm0 \n" |
michael@0 | 4896 | "sub $0x10,%3 \n" |
michael@0 | 4897 | "movdqa %%xmm1," MEMACCESS(2) " \n" |
michael@0 | 4898 | "movdqa %%xmm2," MEMACCESS2(0x10,2) " \n" |
michael@0 | 4899 | "movdqa %%xmm3," MEMACCESS2(0x20,2) " \n" |
michael@0 | 4900 | "movdqa %%xmm0," MEMACCESS2(0x30,2) " \n" |
michael@0 | 4901 | "lea " MEMLEA(0x40,2) ",%2 \n" |
michael@0 | 4902 | "jg 1b \n" |
michael@0 | 4903 | : "+r"(src_sobelx), // %0 |
michael@0 | 4904 | "+r"(src_sobely), // %1 |
michael@0 | 4905 | "+r"(dst_argb), // %2 |
michael@0 | 4906 | "+r"(width) // %3 |
michael@0 | 4907 | : |
michael@0 | 4908 | : "memory", "cc" |
michael@0 | 4909 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 4910 | , "r14" |
michael@0 | 4911 | #endif |
michael@0 | 4912 | #if defined(__SSE2__) |
michael@0 | 4913 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 4914 | #endif |
michael@0 | 4915 | ); |
michael@0 | 4916 | } |
michael@0 | 4917 | #endif // HAS_SOBELROW_SSE2 |
michael@0 | 4918 | |
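// A scalar sketch of the SobelRow loop, for illustration only (not a
// libyuv function): the two Sobel planes are added with byte
// saturation (paddusb) and the sum is replicated into B, G and R, with
// alpha forced to 255.
static uint32 SobelPixel_C_Sketch(uint8 sx, uint8 sy) {
  uint32 s = (uint32)sx + sy;
  if (s > 255) s = 255;
  return 0xff000000u | (s << 16) | (s << 8) | s;
}
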
michael@0 | 4919 | #ifdef HAS_SOBELTOPLANEROW_SSE2 |
michael@0 | 4920 | // Adds Sobel X and Sobel Y and stores Sobel into a plane. |
michael@0 | 4921 | void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
michael@0 | 4922 | uint8* dst_y, int width) { |
michael@0 | 4923 | asm volatile ( |
michael@0 | 4924 | "sub %0,%1 \n" |
michael@0 | 4925 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 4926 | "pslld $0x18,%%xmm5 \n" |
michael@0 | 4927 | |
michael@0 | 4928 | // 8 pixel loop. |
michael@0 | 4929 | LABELALIGN |
michael@0 | 4930 | "1: \n" |
michael@0 | 4931 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4932 | MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 |
michael@0 | 4933 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4934 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 4935 | "sub $0x10,%3 \n" |
michael@0 | 4936 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 4937 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 4938 | "jg 1b \n" |
michael@0 | 4939 | : "+r"(src_sobelx), // %0 |
michael@0 | 4940 | "+r"(src_sobely), // %1 |
michael@0 | 4941 | "+r"(dst_y), // %2 |
michael@0 | 4942 | "+r"(width) // %3 |
michael@0 | 4943 | : |
michael@0 | 4944 | : "memory", "cc" |
michael@0 | 4945 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 4946 | , "r14" |
michael@0 | 4947 | #endif |
michael@0 | 4948 | #if defined(__SSE2__) |
michael@0 | 4949 | , "xmm0", "xmm1" |
michael@0 | 4950 | #endif |
michael@0 | 4951 | ); |
michael@0 | 4952 | } |
michael@0 | 4953 | #endif // HAS_SOBELTOPLANEROW_SSE2 |
michael@0 | 4954 | |
michael@0 | 4955 | #ifdef HAS_SOBELXYROW_SSE2 |
michael@0 | 4956 | // Mixes Sobel X, Sobel Y and Sobel into ARGB. |
michael@0 | 4957 | // A = 255 |
michael@0 | 4958 | // R = Sobel X |
michael@0 | 4959 | // G = Sobel |
michael@0 | 4960 | // B = Sobel Y |
michael@0 | 4961 | void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, |
michael@0 | 4962 | uint8* dst_argb, int width) { |
michael@0 | 4963 | asm volatile ( |
michael@0 | 4964 | "sub %0,%1 \n" |
michael@0 | 4965 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 4966 | |
michael@0 | 4967 | // 8 pixel loop. |
michael@0 | 4968 | LABELALIGN |
michael@0 | 4969 | "1: \n" |
michael@0 | 4970 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 4971 | MEMOPREG(movdqa,0x00,0,1,1,xmm1) // movdqa (%0,%1,1),%%xmm1 |
michael@0 | 4972 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 4973 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 4974 | "paddusb %%xmm1,%%xmm2 \n" |
michael@0 | 4975 | "movdqa %%xmm0,%%xmm3 \n" |
michael@0 | 4976 | "punpcklbw %%xmm5,%%xmm3 \n" |
michael@0 | 4977 | "punpckhbw %%xmm5,%%xmm0 \n" |
michael@0 | 4978 | "movdqa %%xmm1,%%xmm4 \n" |
michael@0 | 4979 | "punpcklbw %%xmm2,%%xmm4 \n" |
michael@0 | 4980 | "punpckhbw %%xmm2,%%xmm1 \n" |
michael@0 | 4981 | "movdqa %%xmm4,%%xmm6 \n" |
michael@0 | 4982 | "punpcklwd %%xmm3,%%xmm6 \n" |
michael@0 | 4983 | "punpckhwd %%xmm3,%%xmm4 \n" |
michael@0 | 4984 | "movdqa %%xmm1,%%xmm7 \n" |
michael@0 | 4985 | "punpcklwd %%xmm0,%%xmm7 \n" |
michael@0 | 4986 | "punpckhwd %%xmm0,%%xmm1 \n" |
michael@0 | 4987 | "sub $0x10,%3 \n" |
michael@0 | 4988 | "movdqa %%xmm6," MEMACCESS(2) " \n" |
michael@0 | 4989 | "movdqa %%xmm4," MEMACCESS2(0x10,2) " \n" |
michael@0 | 4990 | "movdqa %%xmm7," MEMACCESS2(0x20,2) " \n" |
michael@0 | 4991 | "movdqa %%xmm1," MEMACCESS2(0x30,2) " \n" |
michael@0 | 4992 | "lea " MEMLEA(0x40,2) ",%2 \n" |
michael@0 | 4993 | "jg 1b \n" |
michael@0 | 4994 | : "+r"(src_sobelx), // %0 |
michael@0 | 4995 | "+r"(src_sobely), // %1 |
michael@0 | 4996 | "+r"(dst_argb), // %2 |
michael@0 | 4997 | "+r"(width) // %3 |
michael@0 | 4998 | : |
michael@0 | 4999 | : "memory", "cc" |
michael@0 | 5000 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5001 | , "r14" |
michael@0 | 5002 | #endif |
michael@0 | 5003 | #if defined(__SSE2__) |
michael@0 | 5004 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 5005 | #endif |
michael@0 | 5006 | ); |
michael@0 | 5007 | } |
michael@0 | 5008 | #endif // HAS_SOBELXYROW_SSE2 |
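
// Scalar sketch of SobelXYRow following the channel layout documented above
// (A = 255, R = Sobel X, G = Sobel, B = Sobel Y). Hypothetical reference
// helper with the same semantics as the unpack sequence.
static void SobelXYRow_Ref(const uint8* src_sobelx, const uint8* src_sobely,
                           uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    int r = src_sobelx[i];
    int b = src_sobely[i];
    int g = r + b;
    if (g > 255) g = 255;
    dst_argb[0] = (uint8)b;
    dst_argb[1] = (uint8)g;
    dst_argb[2] = (uint8)r;
    dst_argb[3] = 255u;
    dst_argb += 4;
  }
}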
michael@0 | 5009 | |
michael@0 | 5010 | #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 |
michael@0 | 5011 | // Creates a table of cumulative sums where each value is a sum of all values |
michael@0 | 5012 | // above and to the left of the value, inclusive of the value. |
michael@0 | 5013 | void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, |
michael@0 | 5014 | const int32* previous_cumsum, int width) { |
michael@0 | 5015 | asm volatile ( |
michael@0 | 5016 | "pxor %%xmm0,%%xmm0 \n" |
michael@0 | 5017 | "pxor %%xmm1,%%xmm1 \n" |
michael@0 | 5018 | "sub $0x4,%3 \n" |
michael@0 | 5019 | "jl 49f \n" |
michael@0 | 5020 | "test $0xf,%1 \n" |
michael@0 | 5021 | "jne 49f \n" |
michael@0 | 5022 | |
michael@0 | 5023 | // 4 pixel loop. |
michael@0 | 5024 | LABELALIGN |
michael@0 | 5025 | "40: \n" |
michael@0 | 5026 | "movdqu " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 5027 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 5028 | "movdqa %%xmm2,%%xmm4 \n" |
michael@0 | 5029 | "punpcklbw %%xmm1,%%xmm2 \n" |
michael@0 | 5030 | "movdqa %%xmm2,%%xmm3 \n" |
michael@0 | 5031 | "punpcklwd %%xmm1,%%xmm2 \n" |
michael@0 | 5032 | "punpckhwd %%xmm1,%%xmm3 \n" |
michael@0 | 5033 | "punpckhbw %%xmm1,%%xmm4 \n" |
michael@0 | 5034 | "movdqa %%xmm4,%%xmm5 \n" |
michael@0 | 5035 | "punpcklwd %%xmm1,%%xmm4 \n" |
michael@0 | 5036 | "punpckhwd %%xmm1,%%xmm5 \n" |
michael@0 | 5037 | "paddd %%xmm2,%%xmm0 \n" |
michael@0 | 5038 | "movdqa " MEMACCESS(2) ",%%xmm2 \n" |
michael@0 | 5039 | "paddd %%xmm0,%%xmm2 \n" |
michael@0 | 5040 | "paddd %%xmm3,%%xmm0 \n" |
michael@0 | 5041 | "movdqa " MEMACCESS2(0x10,2) ",%%xmm3 \n" |
michael@0 | 5042 | "paddd %%xmm0,%%xmm3 \n" |
michael@0 | 5043 | "paddd %%xmm4,%%xmm0 \n" |
michael@0 | 5044 | "movdqa " MEMACCESS2(0x20,2) ",%%xmm4 \n" |
michael@0 | 5045 | "paddd %%xmm0,%%xmm4 \n" |
michael@0 | 5046 | "paddd %%xmm5,%%xmm0 \n" |
michael@0 | 5047 | "movdqa " MEMACCESS2(0x30,2) ",%%xmm5 \n" |
michael@0 | 5048 | "lea " MEMLEA(0x40,2) ",%2 \n" |
michael@0 | 5049 | "paddd %%xmm0,%%xmm5 \n" |
michael@0 | 5050 | "movdqa %%xmm2," MEMACCESS(1) " \n" |
michael@0 | 5051 | "movdqa %%xmm3," MEMACCESS2(0x10,1) " \n" |
michael@0 | 5052 | "movdqa %%xmm4," MEMACCESS2(0x20,1) " \n" |
michael@0 | 5053 | "movdqa %%xmm5," MEMACCESS2(0x30,1) " \n" |
michael@0 | 5054 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 5055 | "sub $0x4,%3 \n" |
michael@0 | 5056 | "jge 40b \n" |
michael@0 | 5057 | |
michael@0 | 5058 | "49: \n" |
michael@0 | 5059 | "add $0x3,%3 \n" |
michael@0 | 5060 | "jl 19f \n" |
michael@0 | 5061 | |
michael@0 | 5062 | // 1 pixel loop. |
michael@0 | 5063 | LABELALIGN |
michael@0 | 5064 | "10: \n" |
michael@0 | 5065 | "movd " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 5066 | "lea " MEMLEA(0x4,0) ",%0 \n" |
michael@0 | 5067 | "punpcklbw %%xmm1,%%xmm2 \n" |
michael@0 | 5068 | "punpcklwd %%xmm1,%%xmm2 \n" |
michael@0 | 5069 | "paddd %%xmm2,%%xmm0 \n" |
michael@0 | 5070 | "movdqu " MEMACCESS(2) ",%%xmm2 \n" |
michael@0 | 5071 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 5072 | "paddd %%xmm0,%%xmm2 \n" |
michael@0 | 5073 | "movdqu %%xmm2," MEMACCESS(1) " \n" |
michael@0 | 5074 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5075 | "sub $0x1,%3 \n" |
michael@0 | 5076 | "jge 10b \n" |
michael@0 | 5077 | |
michael@0 | 5078 | "19: \n" |
michael@0 | 5079 | : "+r"(row), // %0 |
michael@0 | 5080 | "+r"(cumsum), // %1 |
michael@0 | 5081 | "+r"(previous_cumsum), // %2 |
michael@0 | 5082 | "+r"(width) // %3 |
michael@0 | 5083 | : |
michael@0 | 5084 | : "memory", "cc" |
michael@0 | 5085 | #if defined(__SSE2__) |
michael@0 | 5086 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 5087 | #endif |
michael@0 | 5088 | ); |
michael@0 | 5089 | } |
michael@0 | 5090 | #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 |
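
// Scalar sketch of the cumulative-sum recurrence above: a running
// per-channel row sum is added to the previous row's cumulative sums, so
// cumsum[x] = previous_cumsum[x] + sum(row[0..x]) per channel.
// Hypothetical reference helper.
static void ComputeCumulativeSumRow_Ref(const uint8* row, int32* cumsum,
                                        const int32* previous_cumsum,
                                        int width) {
  int32 row_sum[4] = {0, 0, 0, 0};
  for (int x = 0; x < width; ++x) {
    for (int c = 0; c < 4; ++c) {
      row_sum[c] += row[c];
      cumsum[c] = row_sum[c] + previous_cumsum[c];
    }
    row += 4;
    cumsum += 4;
    previous_cumsum += 4;
  }
}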
michael@0 | 5091 | |
michael@0 | 5092 | #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
michael@0 | 5093 | void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, |
michael@0 | 5094 | int width, int area, uint8* dst, |
michael@0 | 5095 | int count) { |
michael@0 | 5096 | asm volatile ( |
michael@0 | 5097 | "movd %5,%%xmm5 \n" |
michael@0 | 5098 | "cvtdq2ps %%xmm5,%%xmm5 \n" |
michael@0 | 5099 | "rcpss %%xmm5,%%xmm4 \n" |
michael@0 | 5100 | "pshufd $0x0,%%xmm4,%%xmm4 \n" |
michael@0 | 5101 | "sub $0x4,%3 \n" |
michael@0 | 5102 | "jl 49f \n" |
michael@0 | 5103 | "cmpl $0x80,%5 \n" |
michael@0 | 5104 | "ja 40f \n" |
michael@0 | 5105 | |
michael@0 | 5106 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5107 | "pcmpeqb %%xmm6,%%xmm6 \n" |
michael@0 | 5108 | "psrld $0x10,%%xmm6 \n" |
michael@0 | 5109 | "cvtdq2ps %%xmm6,%%xmm6 \n" |
michael@0 | 5110 | "addps %%xmm6,%%xmm5 \n" |
michael@0 | 5111 | "mulps %%xmm4,%%xmm5 \n" |
michael@0 | 5112 | "cvtps2dq %%xmm5,%%xmm5 \n" |
michael@0 | 5113 | "packssdw %%xmm5,%%xmm5 \n" |
michael@0 | 5114 | |
michael@0 | 5115 | // 4 pixel small loop. |
michael@0 | 5116 | LABELALIGN |
michael@0 | 5117 | "4: \n" |
michael@0 | 5118 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5119 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 5120 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 5121 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 5122 | BUNDLEALIGN |
michael@0 | 5123 | MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 |
michael@0 | 5124 | MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 |
michael@0 | 5125 | MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 |
michael@0 | 5126 | MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 |
michael@0 | 5127 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 5128 | "psubd " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5129 | "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" |
michael@0 | 5130 | "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" |
michael@0 | 5131 | "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" |
michael@0 | 5132 | BUNDLEALIGN |
michael@0 | 5133 | MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 |
michael@0 | 5134 | MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 |
michael@0 | 5135 | MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 |
michael@0 | 5136 | MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 |
michael@0 | 5137 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 5138 | "packssdw %%xmm1,%%xmm0 \n" |
michael@0 | 5139 | "packssdw %%xmm3,%%xmm2 \n" |
michael@0 | 5140 | "pmulhuw %%xmm5,%%xmm0 \n" |
michael@0 | 5141 | "pmulhuw %%xmm5,%%xmm2 \n" |
michael@0 | 5142 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 5143 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 5144 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 5145 | "sub $0x4,%3 \n" |
michael@0 | 5146 | "jge 4b \n" |
michael@0 | 5147 | "jmp 49f \n" |
michael@0 | 5148 | |
michael@0 | 5149 | // 4 pixel loop. |
michael@0 | 5150 | LABELALIGN |
michael@0 | 5151 | "40: \n" |
michael@0 | 5152 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5153 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 5154 | "movdqa " MEMACCESS2(0x20,0) ",%%xmm2 \n" |
michael@0 | 5155 | "movdqa " MEMACCESS2(0x30,0) ",%%xmm3 \n" |
michael@0 | 5156 | BUNDLEALIGN |
michael@0 | 5157 | MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 |
michael@0 | 5158 | MEMOPREG(psubd,0x10,0,4,4,xmm1) // psubd 0x10(%0,%4,4),%%xmm1 |
michael@0 | 5159 | MEMOPREG(psubd,0x20,0,4,4,xmm2) // psubd 0x20(%0,%4,4),%%xmm2 |
michael@0 | 5160 | MEMOPREG(psubd,0x30,0,4,4,xmm3) // psubd 0x30(%0,%4,4),%%xmm3 |
michael@0 | 5161 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 5162 | "psubd " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5163 | "psubd " MEMACCESS2(0x10,1) ",%%xmm1 \n" |
michael@0 | 5164 | "psubd " MEMACCESS2(0x20,1) ",%%xmm2 \n" |
michael@0 | 5165 | "psubd " MEMACCESS2(0x30,1) ",%%xmm3 \n" |
michael@0 | 5166 | BUNDLEALIGN |
michael@0 | 5167 | MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 |
michael@0 | 5168 | MEMOPREG(paddd,0x10,1,4,4,xmm1) // paddd 0x10(%1,%4,4),%%xmm1 |
michael@0 | 5169 | MEMOPREG(paddd,0x20,1,4,4,xmm2) // paddd 0x20(%1,%4,4),%%xmm2 |
michael@0 | 5170 | MEMOPREG(paddd,0x30,1,4,4,xmm3) // paddd 0x30(%1,%4,4),%%xmm3 |
michael@0 | 5171 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 5172 | "cvtdq2ps %%xmm0,%%xmm0 \n" |
michael@0 | 5173 | "cvtdq2ps %%xmm1,%%xmm1 \n" |
michael@0 | 5174 | "mulps %%xmm4,%%xmm0 \n" |
michael@0 | 5175 | "mulps %%xmm4,%%xmm1 \n" |
michael@0 | 5176 | "cvtdq2ps %%xmm2,%%xmm2 \n" |
michael@0 | 5177 | "cvtdq2ps %%xmm3,%%xmm3 \n" |
michael@0 | 5178 | "mulps %%xmm4,%%xmm2 \n" |
michael@0 | 5179 | "mulps %%xmm4,%%xmm3 \n" |
michael@0 | 5180 | "cvtps2dq %%xmm0,%%xmm0 \n" |
michael@0 | 5181 | "cvtps2dq %%xmm1,%%xmm1 \n" |
michael@0 | 5182 | "cvtps2dq %%xmm2,%%xmm2 \n" |
michael@0 | 5183 | "cvtps2dq %%xmm3,%%xmm3 \n" |
michael@0 | 5184 | "packssdw %%xmm1,%%xmm0 \n" |
michael@0 | 5185 | "packssdw %%xmm3,%%xmm2 \n" |
michael@0 | 5186 | "packuswb %%xmm2,%%xmm0 \n" |
michael@0 | 5187 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 5188 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 5189 | "sub $0x4,%3 \n" |
michael@0 | 5190 | "jge 40b \n" |
michael@0 | 5191 | |
michael@0 | 5192 | "49: \n" |
michael@0 | 5193 | "add $0x3,%3 \n" |
michael@0 | 5194 | "jl 19f \n" |
michael@0 | 5195 | |
michael@0 | 5196 | // 1 pixel loop. |
michael@0 | 5197 | LABELALIGN |
michael@0 | 5198 | "10: \n" |
michael@0 | 5199 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5200 | MEMOPREG(psubd,0x00,0,4,4,xmm0) // psubd 0x00(%0,%4,4),%%xmm0 |
michael@0 | 5201 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 5202 | "psubd " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5203 | BUNDLEALIGN |
michael@0 | 5204 | MEMOPREG(paddd,0x00,1,4,4,xmm0) // paddd 0x00(%1,%4,4),%%xmm0 |
michael@0 | 5205 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5206 | "cvtdq2ps %%xmm0,%%xmm0 \n" |
michael@0 | 5207 | "mulps %%xmm4,%%xmm0 \n" |
michael@0 | 5208 | "cvtps2dq %%xmm0,%%xmm0 \n" |
michael@0 | 5209 | "packssdw %%xmm0,%%xmm0 \n" |
michael@0 | 5210 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 5211 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 5212 | "lea " MEMLEA(0x4,2) ",%2 \n" |
michael@0 | 5213 | "sub $0x1,%3 \n" |
michael@0 | 5214 | "jge 10b \n" |
michael@0 | 5215 | "19: \n" |
michael@0 | 5216 | : "+r"(topleft), // %0 |
michael@0 | 5217 | "+r"(botleft), // %1 |
michael@0 | 5218 | "+r"(dst), // %2 |
michael@0 | 5219 | "+rm"(count) // %3 |
michael@0 | 5220 | : "r"((intptr_t)(width)), // %4 |
michael@0 | 5221 | "rm"(area) // %5 |
michael@0 | 5222 | : "memory", "cc" |
michael@0 | 5223 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5224 | , "r14" |
michael@0 | 5225 | #endif |
michael@0 | 5226 | #if defined(__SSE2__) |
michael@0 | 5227 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 5228 | #endif |
michael@0 | 5229 | ); |
michael@0 | 5230 | } |
michael@0 | 5231 | #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 |
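
// Scalar sketch of the float path above: each output channel is the box sum
// taken from four corners of the cumulative-sum table, scaled by the
// reciprocal of the box area (approximated with rcpss in the asm). Here
// width is in int32 elements, matching the %4 indexing above. Hypothetical
// reference helper.
static void CumulativeSumToAverageRow_Ref(const int32* topleft,
                                          const int32* botleft,
                                          int width, int area, uint8* dst,
                                          int count) {
  float ooa = 1.0f / area;
  for (int i = 0; i < count; ++i) {
    for (int c = 0; c < 4; ++c) {
      int32 sum = topleft[c] - topleft[width + c] -
                  botleft[c] + botleft[width + c];
      dst[c] = (uint8)(sum * ooa);
    }
    topleft += 4;
    botleft += 4;
    dst += 4;
  }
}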
michael@0 | 5232 | |
michael@0 | 5233 | #ifdef HAS_ARGBAFFINEROW_SSE2 |
michael@0 | 5234 | // Copy ARGB pixels from a source image, stepping by an affine slope, to a row of the destination. |
michael@0 | 5235 | LIBYUV_API |
michael@0 | 5236 | void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, |
michael@0 | 5237 | uint8* dst_argb, const float* src_dudv, int width) { |
michael@0 | 5238 | intptr_t src_argb_stride_temp = src_argb_stride; |
michael@0 | 5239 | intptr_t temp = 0; |
michael@0 | 5240 | asm volatile ( |
michael@0 | 5241 | "movq " MEMACCESS(3) ",%%xmm2 \n" |
michael@0 | 5242 | "movq " MEMACCESS2(0x08,3) ",%%xmm7 \n" |
michael@0 | 5243 | "shl $0x10,%1 \n" |
michael@0 | 5244 | "add $0x4,%1 \n" |
michael@0 | 5245 | "movd %1,%%xmm5 \n" |
michael@0 | 5246 | "sub $0x4,%4 \n" |
michael@0 | 5247 | "jl 49f \n" |
michael@0 | 5248 | |
michael@0 | 5249 | "pshufd $0x44,%%xmm7,%%xmm7 \n" |
michael@0 | 5250 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5251 | "movdqa %%xmm2,%%xmm0 \n" |
michael@0 | 5252 | "addps %%xmm7,%%xmm0 \n" |
michael@0 | 5253 | "movlhps %%xmm0,%%xmm2 \n" |
michael@0 | 5254 | "movdqa %%xmm7,%%xmm4 \n" |
michael@0 | 5255 | "addps %%xmm4,%%xmm4 \n" |
michael@0 | 5256 | "movdqa %%xmm2,%%xmm3 \n" |
michael@0 | 5257 | "addps %%xmm4,%%xmm3 \n" |
michael@0 | 5258 | "addps %%xmm4,%%xmm4 \n" |
michael@0 | 5259 | |
michael@0 | 5260 | // 4 pixel loop. |
michael@0 | 5261 | LABELALIGN |
michael@0 | 5262 | "40: \n" |
michael@0 | 5263 | "cvttps2dq %%xmm2,%%xmm0 \n" // x, y float to int first 2 |
michael@0 | 5264 | "cvttps2dq %%xmm3,%%xmm1 \n" // x, y float to int next 2 |
michael@0 | 5265 | "packssdw %%xmm1,%%xmm0 \n" // x, y as 8 shorts |
michael@0 | 5266 | "pmaddwd %%xmm5,%%xmm0 \n" // off = x * 4 + y * stride |
michael@0 | 5267 | "movd %%xmm0,%k1 \n" |
michael@0 | 5268 | "pshufd $0x39,%%xmm0,%%xmm0 \n" |
michael@0 | 5269 | "movd %%xmm0,%k5 \n" |
michael@0 | 5270 | "pshufd $0x39,%%xmm0,%%xmm0 \n" |
michael@0 | 5271 | BUNDLEALIGN |
michael@0 | 5272 | MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 |
michael@0 | 5273 | MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 |
michael@0 | 5274 | "punpckldq %%xmm6,%%xmm1 \n" |
michael@0 | 5275 | "addps %%xmm4,%%xmm2 \n" |
michael@0 | 5276 | "movq %%xmm1," MEMACCESS(2) " \n" |
michael@0 | 5277 | "movd %%xmm0,%k1 \n" |
michael@0 | 5278 | "pshufd $0x39,%%xmm0,%%xmm0 \n" |
michael@0 | 5279 | "movd %%xmm0,%k5 \n" |
michael@0 | 5280 | BUNDLEALIGN |
michael@0 | 5281 | MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 |
michael@0 | 5282 | MEMOPREG(movd,0x00,0,5,1,xmm6) // movd (%0,%5,1),%%xmm6 |
michael@0 | 5283 | "punpckldq %%xmm6,%%xmm0 \n" |
michael@0 | 5284 | "addps %%xmm4,%%xmm3 \n" |
michael@0 | 5285 | "sub $0x4,%4 \n" |
michael@0 | 5286 | "movq %%xmm0," MEMACCESS2(0x08,2) " \n" |
michael@0 | 5287 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 5288 | "jge 40b \n" |
michael@0 | 5289 | |
michael@0 | 5290 | "49: \n" |
michael@0 | 5291 | "add $0x3,%4 \n" |
michael@0 | 5292 | "jl 19f \n" |
michael@0 | 5293 | |
michael@0 | 5294 | // 1 pixel loop. |
michael@0 | 5295 | LABELALIGN |
michael@0 | 5296 | "10: \n" |
michael@0 | 5297 | "cvttps2dq %%xmm2,%%xmm0 \n" |
michael@0 | 5298 | "packssdw %%xmm0,%%xmm0 \n" |
michael@0 | 5299 | "pmaddwd %%xmm5,%%xmm0 \n" |
michael@0 | 5300 | "addps %%xmm7,%%xmm2 \n" |
michael@0 | 5301 | "movd %%xmm0,%k1 \n" |
michael@0 | 5302 | BUNDLEALIGN |
michael@0 | 5303 | MEMOPREG(movd,0x00,0,1,1,xmm0) // movd (%0,%1,1),%%xmm0 |
michael@0 | 5304 | "sub $0x1,%4 \n" |
michael@0 | 5305 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 5306 | "lea " MEMLEA(0x04,2) ",%2 \n" |
michael@0 | 5307 | "jge 10b \n" |
michael@0 | 5308 | "19: \n" |
michael@0 | 5309 | : "+r"(src_argb), // %0 |
michael@0 | 5310 | "+r"(src_argb_stride_temp), // %1 |
michael@0 | 5311 | "+r"(dst_argb), // %2 |
michael@0 | 5312 | "+r"(src_dudv), // %3 |
michael@0 | 5313 | "+rm"(width), // %4 |
michael@0 | 5314 | "+r"(temp) // %5 |
michael@0 | 5315 | : |
michael@0 | 5316 | : "memory", "cc" |
michael@0 | 5317 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5318 | , "r14" |
michael@0 | 5319 | #endif |
michael@0 | 5320 | #if defined(__SSE2__) |
michael@0 | 5321 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 5322 | #endif |
michael@0 | 5323 | ); |
michael@0 | 5324 | } |
michael@0 | 5325 | #endif // HAS_ARGBAFFINEROW_SSE2 |
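
// Scalar sketch of the affine fetch above: src_dudv holds (u, v, du, dv);
// each destination pixel copies the nearest source texel at (u, v), then
// (u, v) steps by (du, dv). pmaddwd computes the byte offset as
// x * 4 + y * stride from the packed (x, y) shorts. Hypothetical helper.
static void ARGBAffineRow_Ref(const uint8* src_argb, int src_argb_stride,
                              uint8* dst_argb, const float* src_dudv,
                              int width) {
  float u = src_dudv[0];
  float v = src_dudv[1];
  for (int i = 0; i < width; ++i) {
    int x = (int)u;  // cvttps2dq truncates toward zero.
    int y = (int)v;
    *(uint32*)(dst_argb) =
        *(const uint32*)(src_argb + y * src_argb_stride + x * 4);
    dst_argb += 4;
    u += src_dudv[2];
    v += src_dudv[3];
  }
}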
michael@0 | 5326 | |
michael@0 | 5327 | #ifdef HAS_INTERPOLATEROW_SSSE3 |
michael@0 | 5328 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 5329 | void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 5330 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 5331 | int source_y_fraction) { |
michael@0 | 5332 | asm volatile ( |
michael@0 | 5333 | "sub %1,%0 \n" |
michael@0 | 5334 | "shr %3 \n" |
michael@0 | 5335 | "cmp $0x0,%3 \n" |
michael@0 | 5336 | "je 100f \n" |
michael@0 | 5337 | "cmp $0x20,%3 \n" |
michael@0 | 5338 | "je 75f \n" |
michael@0 | 5339 | "cmp $0x40,%3 \n" |
michael@0 | 5340 | "je 50f \n" |
michael@0 | 5341 | "cmp $0x60,%3 \n" |
michael@0 | 5342 | "je 25f \n" |
michael@0 | 5343 | |
michael@0 | 5344 | "movd %3,%%xmm0 \n" |
michael@0 | 5345 | "neg %3 \n" |
michael@0 | 5346 | "add $0x80,%3 \n" |
michael@0 | 5347 | "movd %3,%%xmm5 \n" |
michael@0 | 5348 | "punpcklbw %%xmm0,%%xmm5 \n" |
michael@0 | 5349 | "punpcklwd %%xmm5,%%xmm5 \n" |
michael@0 | 5350 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5351 | |
michael@0 | 5352 | // General purpose row blend. |
michael@0 | 5353 | LABELALIGN |
michael@0 | 5354 | "1: \n" |
michael@0 | 5355 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5356 | MEMOPREG(movdqa,0x00,1,4,1,xmm2) |
michael@0 | 5357 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 5358 | "punpcklbw %%xmm2,%%xmm0 \n" |
michael@0 | 5359 | "punpckhbw %%xmm2,%%xmm1 \n" |
michael@0 | 5360 | "pmaddubsw %%xmm5,%%xmm0 \n" |
michael@0 | 5361 | "pmaddubsw %%xmm5,%%xmm1 \n" |
michael@0 | 5362 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 5363 | "psrlw $0x7,%%xmm1 \n" |
michael@0 | 5364 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 5365 | "sub $0x10,%2 \n" |
michael@0 | 5366 | BUNDLEALIGN |
michael@0 | 5367 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) |
michael@0 | 5368 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5369 | "jg 1b \n" |
michael@0 | 5370 | "jmp 99f \n" |
michael@0 | 5371 | |
michael@0 | 5372 | // Blend 25 / 75. |
michael@0 | 5373 | LABELALIGN |
michael@0 | 5374 | "25: \n" |
michael@0 | 5375 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5376 | MEMOPREG(movdqa,0x00,1,4,1,xmm1) |
michael@0 | 5377 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5378 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5379 | "sub $0x10,%2 \n" |
michael@0 | 5380 | BUNDLEALIGN |
michael@0 | 5381 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) |
michael@0 | 5382 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5383 | "jg 25b \n" |
michael@0 | 5384 | "jmp 99f \n" |
michael@0 | 5385 | |
michael@0 | 5386 | // Blend 50 / 50. |
michael@0 | 5387 | LABELALIGN |
michael@0 | 5388 | "50: \n" |
michael@0 | 5389 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5390 | MEMOPREG(movdqa,0x00,1,4,1,xmm1) |
michael@0 | 5391 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5392 | "sub $0x10,%2 \n" |
michael@0 | 5393 | BUNDLEALIGN |
michael@0 | 5394 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) |
michael@0 | 5395 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5396 | "jg 50b \n" |
michael@0 | 5397 | "jmp 99f \n" |
michael@0 | 5398 | |
michael@0 | 5399 | // Blend 75 / 25. |
michael@0 | 5400 | LABELALIGN |
michael@0 | 5401 | "75: \n" |
michael@0 | 5402 | "movdqa " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 5403 | MEMOPREG(movdqa,0x00,1,4,1,xmm0) |
michael@0 | 5404 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5405 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5406 | "sub $0x10,%2 \n" |
michael@0 | 5407 | BUNDLEALIGN |
michael@0 | 5408 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) |
michael@0 | 5409 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5410 | "jg 75b \n" |
michael@0 | 5411 | "jmp 99f \n" |
michael@0 | 5412 | |
michael@0 | 5413 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 5414 | LABELALIGN |
michael@0 | 5415 | "100: \n" |
michael@0 | 5416 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5417 | "sub $0x10,%2 \n" |
michael@0 | 5418 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) |
michael@0 | 5419 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5420 | "jg 100b \n" |
michael@0 | 5421 | |
michael@0 | 5422 | "99: \n" |
michael@0 | 5423 | : "+r"(dst_ptr), // %0 |
michael@0 | 5424 | "+r"(src_ptr), // %1 |
michael@0 | 5425 | "+r"(dst_width), // %2 |
michael@0 | 5426 | "+r"(source_y_fraction) // %3 |
michael@0 | 5427 | : "r"((intptr_t)(src_stride)) // %4 |
michael@0 | 5428 | : "memory", "cc" |
michael@0 | 5429 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5430 | , "r14" |
michael@0 | 5431 | #endif |
michael@0 | 5432 | #if defined(__SSE2__) |
michael@0 | 5433 | , "xmm0", "xmm1", "xmm2", "xmm5" |
michael@0 | 5434 | #endif |
michael@0 | 5435 | ); |
michael@0 | 5436 | } |
michael@0 | 5437 | #endif // HAS_INTERPOLATEROW_SSSE3 |
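
// Scalar sketch of the SSSE3 blend: with f = source_y_fraction / 2 in
// [0, 128], pmaddubsw forms src[x] * (128 - f) + src1[x] * f per byte and
// psrlw $0x7 divides by 128. The 0%, 25%, 50% and 75% cases above are just
// pavgb shortcuts for this formula. Hypothetical reference helper.
static void InterpolateRow_Ref(uint8* dst_ptr, const uint8* src_ptr,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
  int f = source_y_fraction >> 1;  // the asm halves the fraction ("shr %3").
  const uint8* src_ptr1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * (128 - f) + src_ptr1[x] * f) >> 7);
  }
}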
michael@0 | 5438 | |
michael@0 | 5439 | #ifdef HAS_INTERPOLATEROW_SSE2 |
michael@0 | 5440 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 5441 | void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 5442 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 5443 | int source_y_fraction) { |
michael@0 | 5444 | asm volatile ( |
michael@0 | 5445 | "sub %1,%0 \n" |
michael@0 | 5446 | "shr %3 \n" |
michael@0 | 5447 | "cmp $0x0,%3 \n" |
michael@0 | 5448 | "je 100f \n" |
michael@0 | 5449 | "cmp $0x20,%3 \n" |
michael@0 | 5450 | "je 75f \n" |
michael@0 | 5451 | "cmp $0x40,%3 \n" |
michael@0 | 5452 | "je 50f \n" |
michael@0 | 5453 | "cmp $0x60,%3 \n" |
michael@0 | 5454 | "je 25f \n" |
michael@0 | 5455 | |
michael@0 | 5456 | "movd %3,%%xmm0 \n" |
michael@0 | 5457 | "neg %3 \n" |
michael@0 | 5458 | "add $0x80,%3 \n" |
michael@0 | 5459 | "movd %3,%%xmm5 \n" |
michael@0 | 5460 | "punpcklbw %%xmm0,%%xmm5 \n" |
michael@0 | 5461 | "punpcklwd %%xmm5,%%xmm5 \n" |
michael@0 | 5462 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5463 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 5464 | |
michael@0 | 5465 | // General purpose row blend. |
michael@0 | 5466 | LABELALIGN |
michael@0 | 5467 | "1: \n" |
michael@0 | 5468 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5469 | MEMOPREG(movdqa,0x00,1,4,1,xmm2) // movdqa (%1,%4,1),%%xmm2 |
michael@0 | 5470 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 5471 | "movdqa %%xmm2,%%xmm3 \n" |
michael@0 | 5472 | "punpcklbw %%xmm4,%%xmm2 \n" |
michael@0 | 5473 | "punpckhbw %%xmm4,%%xmm3 \n" |
michael@0 | 5474 | "punpcklbw %%xmm4,%%xmm0 \n" |
michael@0 | 5475 | "punpckhbw %%xmm4,%%xmm1 \n" |
michael@0 | 5476 | "psubw %%xmm0,%%xmm2 \n" |
michael@0 | 5477 | "psubw %%xmm1,%%xmm3 \n" |
michael@0 | 5478 | "paddw %%xmm2,%%xmm2 \n" |
michael@0 | 5479 | "paddw %%xmm3,%%xmm3 \n" |
michael@0 | 5480 | "pmulhw %%xmm5,%%xmm2 \n" |
michael@0 | 5481 | "pmulhw %%xmm5,%%xmm3 \n" |
michael@0 | 5482 | "paddw %%xmm2,%%xmm0 \n" |
michael@0 | 5483 | "paddw %%xmm3,%%xmm1 \n" |
michael@0 | 5484 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 5485 | "sub $0x10,%2 \n" |
michael@0 | 5486 | BUNDLEALIGN |
michael@0 | 5487 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) |
michael@0 | 5488 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5489 | "jg 1b \n" |
michael@0 | 5490 | "jmp 99f \n" |
michael@0 | 5491 | |
michael@0 | 5492 | // Blend 25 / 75. |
michael@0 | 5493 | LABELALIGN |
michael@0 | 5494 | "25: \n" |
michael@0 | 5495 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5496 | MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 |
michael@0 | 5497 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5498 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5499 | "sub $0x10,%2 \n" |
michael@0 | 5500 | BUNDLEALIGN |
michael@0 | 5501 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) |
michael@0 | 5502 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5503 | "jg 25b \n" |
michael@0 | 5504 | "jmp 99f \n" |
michael@0 | 5505 | |
michael@0 | 5506 | // Blend 50 / 50. |
michael@0 | 5507 | LABELALIGN |
michael@0 | 5508 | "50: \n" |
michael@0 | 5509 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5510 | MEMOPREG(movdqa,0x00,1,4,1,xmm1) // movdqa (%1,%4,1),%%xmm1 |
michael@0 | 5511 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5512 | "sub $0x10,%2 \n" |
michael@0 | 5513 | BUNDLEALIGN |
michael@0 | 5514 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) |
michael@0 | 5515 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5516 | "jg 50b \n" |
michael@0 | 5517 | "jmp 99f \n" |
michael@0 | 5518 | |
michael@0 | 5519 | // Blend 75 / 25. |
michael@0 | 5520 | LABELALIGN |
michael@0 | 5521 | "75: \n" |
michael@0 | 5522 | "movdqa " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 5523 | MEMOPREG(movdqa,0x00,1,4,1,xmm0) // movdqa (%1,%4,1),%%xmm0 |
michael@0 | 5524 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5525 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5526 | "sub $0x10,%2 \n" |
michael@0 | 5527 | BUNDLEALIGN |
michael@0 | 5528 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) |
michael@0 | 5529 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5530 | "jg 75b \n" |
michael@0 | 5531 | "jmp 99f \n" |
michael@0 | 5532 | |
michael@0 | 5533 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 5534 | LABELALIGN |
michael@0 | 5535 | "100: \n" |
michael@0 | 5536 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5537 | "sub $0x10,%2 \n" |
michael@0 | 5538 | MEMOPMEM(movdqa,xmm0,0x00,1,0,1) // movdqa %%xmm0,(%1,%0,1) |
michael@0 | 5539 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5540 | "jg 100b \n" |
michael@0 | 5541 | |
michael@0 | 5542 | "99: \n" |
michael@0 | 5543 | : "+r"(dst_ptr), // %0 |
michael@0 | 5544 | "+r"(src_ptr), // %1 |
michael@0 | 5545 | "+r"(dst_width), // %2 |
michael@0 | 5546 | "+r"(source_y_fraction) // %3 |
michael@0 | 5547 | : "r"((intptr_t)(src_stride)) // %4 |
michael@0 | 5548 | : "memory", "cc" |
michael@0 | 5549 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5550 | , "r14" |
michael@0 | 5551 | #endif |
michael@0 | 5552 | #if defined(__SSE2__) |
michael@0 | 5553 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 5554 | #endif |
michael@0 | 5555 | ); |
michael@0 | 5556 | } |
michael@0 | 5557 | #endif // HAS_INTERPOLATEROW_SSE2 |
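
// The SSE2 variant cannot use pmaddubsw, so it blends in 16-bit arithmetic.
// A scalar model of that fixed-point step (hypothetical helper): the word
// splatted into xmm5 is (f << 8) + (128 - f), and pmulhw keeps the high 16
// bits of its product with 2 * (b - a), which is approximately
// (b - a) * f / 128.
static uint8 BlendPixel_SSE2_Model(uint8 a, uint8 b, int f) {
  int diff2 = (b - a) * 2;           // psubw then paddw.
  int scale = (f << 8) + (128 - f);  // word broadcast into xmm5.
  return (uint8)(a + ((diff2 * scale) >> 16));  // pmulhw then paddw.
}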
michael@0 | 5558 | |
michael@0 | 5559 | #ifdef HAS_INTERPOLATEROW_SSSE3 |
michael@0 | 5560 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 5561 | void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 5562 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 5563 | int source_y_fraction) { |
michael@0 | 5564 | asm volatile ( |
michael@0 | 5565 | "sub %1,%0 \n" |
michael@0 | 5566 | "shr %3 \n" |
michael@0 | 5567 | "cmp $0x0,%3 \n" |
michael@0 | 5568 | "je 100f \n" |
michael@0 | 5569 | "cmp $0x20,%3 \n" |
michael@0 | 5570 | "je 75f \n" |
michael@0 | 5571 | "cmp $0x40,%3 \n" |
michael@0 | 5572 | "je 50f \n" |
michael@0 | 5573 | "cmp $0x60,%3 \n" |
michael@0 | 5574 | "je 25f \n" |
michael@0 | 5575 | |
michael@0 | 5576 | "movd %3,%%xmm0 \n" |
michael@0 | 5577 | "neg %3 \n" |
michael@0 | 5578 | "add $0x80,%3 \n" |
michael@0 | 5579 | "movd %3,%%xmm5 \n" |
michael@0 | 5580 | "punpcklbw %%xmm0,%%xmm5 \n" |
michael@0 | 5581 | "punpcklwd %%xmm5,%%xmm5 \n" |
michael@0 | 5582 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5583 | |
michael@0 | 5584 | // General purpose row blend. |
michael@0 | 5585 | LABELALIGN |
michael@0 | 5586 | "1: \n" |
michael@0 | 5587 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5588 | MEMOPREG(movdqu,0x00,1,4,1,xmm2) |
michael@0 | 5589 | "movdqu %%xmm0,%%xmm1 \n" |
michael@0 | 5590 | "punpcklbw %%xmm2,%%xmm0 \n" |
michael@0 | 5591 | "punpckhbw %%xmm2,%%xmm1 \n" |
michael@0 | 5592 | "pmaddubsw %%xmm5,%%xmm0 \n" |
michael@0 | 5593 | "pmaddubsw %%xmm5,%%xmm1 \n" |
michael@0 | 5594 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 5595 | "psrlw $0x7,%%xmm1 \n" |
michael@0 | 5596 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 5597 | "sub $0x10,%2 \n" |
michael@0 | 5598 | BUNDLEALIGN |
michael@0 | 5599 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) |
michael@0 | 5600 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5601 | "jg 1b \n" |
michael@0 | 5602 | "jmp 99f \n" |
michael@0 | 5603 | |
michael@0 | 5604 | // Blend 25 / 75. |
michael@0 | 5605 | LABELALIGN |
michael@0 | 5606 | "25: \n" |
michael@0 | 5607 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5608 | MEMOPREG(movdqu,0x00,1,4,1,xmm1) |
michael@0 | 5609 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5610 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5611 | "sub $0x10,%2 \n" |
michael@0 | 5612 | BUNDLEALIGN |
michael@0 | 5613 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) |
michael@0 | 5614 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5615 | "jg 25b \n" |
michael@0 | 5616 | "jmp 99f \n" |
michael@0 | 5617 | |
michael@0 | 5618 | // Blend 50 / 50. |
michael@0 | 5619 | LABELALIGN |
michael@0 | 5620 | "50: \n" |
michael@0 | 5621 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5622 | MEMOPREG(movdqu,0x00,1,4,1,xmm1) |
michael@0 | 5623 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5624 | "sub $0x10,%2 \n" |
michael@0 | 5625 | BUNDLEALIGN |
michael@0 | 5626 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) |
michael@0 | 5627 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5628 | "jg 50b \n" |
michael@0 | 5629 | "jmp 99f \n" |
michael@0 | 5630 | |
michael@0 | 5631 | // Blend 75 / 25. |
michael@0 | 5632 | LABELALIGN |
michael@0 | 5633 | "75: \n" |
michael@0 | 5634 | "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 5635 | MEMOPREG(movdqu,0x00,1,4,1,xmm0) |
michael@0 | 5636 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5637 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5638 | "sub $0x10,%2 \n" |
michael@0 | 5639 | BUNDLEALIGN |
michael@0 | 5640 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) |
michael@0 | 5641 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5642 | "jg 75b \n" |
michael@0 | 5643 | "jmp 99f \n" |
michael@0 | 5644 | |
michael@0 | 5645 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 5646 | LABELALIGN |
michael@0 | 5647 | "100: \n" |
michael@0 | 5648 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5649 | "sub $0x10,%2 \n" |
michael@0 | 5650 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) |
michael@0 | 5651 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5652 | "jg 100b \n" |
michael@0 | 5653 | |
michael@0 | 5654 | "99: \n" |
michael@0 | 5655 | : "+r"(dst_ptr), // %0 |
michael@0 | 5656 | "+r"(src_ptr), // %1 |
michael@0 | 5657 | "+r"(dst_width), // %2 |
michael@0 | 5658 | "+r"(source_y_fraction) // %3 |
michael@0 | 5659 | : "r"((intptr_t)(src_stride)) // %4 |
michael@0 | 5660 | : "memory", "cc" |
michael@0 | 5661 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5662 | , "r14" |
michael@0 | 5663 | #endif |
michael@0 | 5664 | #if defined(__SSE2__) |
michael@0 | 5665 | , "xmm0", "xmm1", "xmm2", "xmm5" |
michael@0 | 5666 | #endif |
michael@0 | 5667 | ); |
michael@0 | 5668 | } |
michael@0 | 5669 | #endif // HAS_INTERPOLATEROW_SSSE3 |
michael@0 | 5670 | |
michael@0 | 5671 | #ifdef HAS_INTERPOLATEROW_SSE2 |
michael@0 | 5672 | // Bilinear filter 16x2 -> 16x1 |
michael@0 | 5673 | void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 5674 | ptrdiff_t src_stride, int dst_width, |
michael@0 | 5675 | int source_y_fraction) { |
michael@0 | 5676 | asm volatile ( |
michael@0 | 5677 | "sub %1,%0 \n" |
michael@0 | 5678 | "shr %3 \n" |
michael@0 | 5679 | "cmp $0x0,%3 \n" |
michael@0 | 5680 | "je 100f \n" |
michael@0 | 5681 | "cmp $0x20,%3 \n" |
michael@0 | 5682 | "je 75f \n" |
michael@0 | 5683 | "cmp $0x40,%3 \n" |
michael@0 | 5684 | "je 50f \n" |
michael@0 | 5685 | "cmp $0x60,%3 \n" |
michael@0 | 5686 | "je 25f \n" |
michael@0 | 5687 | |
michael@0 | 5688 | "movd %3,%%xmm0 \n" |
michael@0 | 5689 | "neg %3 \n" |
michael@0 | 5690 | "add $0x80,%3 \n" |
michael@0 | 5691 | "movd %3,%%xmm5 \n" |
michael@0 | 5692 | "punpcklbw %%xmm0,%%xmm5 \n" |
michael@0 | 5693 | "punpcklwd %%xmm5,%%xmm5 \n" |
michael@0 | 5694 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5695 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 5696 | |
michael@0 | 5697 | // General purpose row blend. |
michael@0 | 5698 | LABELALIGN |
michael@0 | 5699 | "1: \n" |
michael@0 | 5700 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5701 | MEMOPREG(movdqu,0x00,1,4,1,xmm2) // movdqu (%1,%4,1),%%xmm2 |
michael@0 | 5702 | "movdqu %%xmm0,%%xmm1 \n" |
michael@0 | 5703 | "movdqu %%xmm2,%%xmm3 \n" |
michael@0 | 5704 | "punpcklbw %%xmm4,%%xmm2 \n" |
michael@0 | 5705 | "punpckhbw %%xmm4,%%xmm3 \n" |
michael@0 | 5706 | "punpcklbw %%xmm4,%%xmm0 \n" |
michael@0 | 5707 | "punpckhbw %%xmm4,%%xmm1 \n" |
michael@0 | 5708 | "psubw %%xmm0,%%xmm2 \n" |
michael@0 | 5709 | "psubw %%xmm1,%%xmm3 \n" |
michael@0 | 5710 | "paddw %%xmm2,%%xmm2 \n" |
michael@0 | 5711 | "paddw %%xmm3,%%xmm3 \n" |
michael@0 | 5712 | "pmulhw %%xmm5,%%xmm2 \n" |
michael@0 | 5713 | "pmulhw %%xmm5,%%xmm3 \n" |
michael@0 | 5714 | "paddw %%xmm2,%%xmm0 \n" |
michael@0 | 5715 | "paddw %%xmm3,%%xmm1 \n" |
michael@0 | 5716 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 5717 | "sub $0x10,%2 \n" |
michael@0 | 5718 | BUNDLEALIGN |
michael@0 | 5719 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) |
michael@0 | 5720 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5721 | "jg 1b \n" |
michael@0 | 5722 | "jmp 99f \n" |
michael@0 | 5723 | |
michael@0 | 5724 | // Blend 25 / 75. |
michael@0 | 5725 | LABELALIGN |
michael@0 | 5726 | "25: \n" |
michael@0 | 5727 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5728 | MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 |
michael@0 | 5729 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5730 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5731 | "sub $0x10,%2 \n" |
michael@0 | 5732 | BUNDLEALIGN |
michael@0 | 5733 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) |
michael@0 | 5734 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5735 | "jg 25b \n" |
michael@0 | 5736 | "jmp 99f \n" |
michael@0 | 5737 | |
michael@0 | 5738 | // Blend 50 / 50. |
michael@0 | 5739 | LABELALIGN |
michael@0 | 5740 | "50: \n" |
michael@0 | 5741 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5742 | MEMOPREG(movdqu,0x00,1,4,1,xmm1) // movdqu (%1,%4,1),%%xmm1 |
michael@0 | 5743 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5744 | "sub $0x10,%2 \n" |
michael@0 | 5745 | BUNDLEALIGN |
michael@0 | 5746 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) |
michael@0 | 5747 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5748 | "jg 50b \n" |
michael@0 | 5749 | "jmp 99f \n" |
michael@0 | 5750 | |
michael@0 | 5751 | // Blend 75 / 25. |
michael@0 | 5752 | LABELALIGN |
michael@0 | 5753 | "75: \n" |
michael@0 | 5754 | "movdqu " MEMACCESS(1) ",%%xmm1 \n" |
michael@0 | 5755 | MEMOPREG(movdqu,0x00,1,4,1,xmm0) // movdqu (%1,%4,1),%%xmm0 |
michael@0 | 5756 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5757 | "pavgb %%xmm1,%%xmm0 \n" |
michael@0 | 5758 | "sub $0x10,%2 \n" |
michael@0 | 5759 | BUNDLEALIGN |
michael@0 | 5760 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) |
michael@0 | 5761 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5762 | "jg 75b \n" |
michael@0 | 5763 | "jmp 99f \n" |
michael@0 | 5764 | |
michael@0 | 5765 | // Blend 100 / 0 - Copy row unchanged. |
michael@0 | 5766 | LABELALIGN |
michael@0 | 5767 | "100: \n" |
michael@0 | 5768 | "movdqu " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 5769 | "sub $0x10,%2 \n" |
michael@0 | 5770 | MEMOPMEM(movdqu,xmm0,0x00,1,0,1) // movdqu %%xmm0,(%1,%0,1) |
michael@0 | 5771 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 5772 | "jg 100b \n" |
michael@0 | 5773 | |
michael@0 | 5774 | "99: \n" |
michael@0 | 5775 | : "+r"(dst_ptr), // %0 |
michael@0 | 5776 | "+r"(src_ptr), // %1 |
michael@0 | 5777 | "+r"(dst_width), // %2 |
michael@0 | 5778 | "+r"(source_y_fraction) // %3 |
michael@0 | 5779 | : "r"((intptr_t)(src_stride)) // %4 |
michael@0 | 5780 | : "memory", "cc" |
michael@0 | 5781 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 5782 | , "r14" |
michael@0 | 5783 | #endif |
michael@0 | 5784 | #if defined(__SSE2__) |
michael@0 | 5785 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 5786 | #endif |
michael@0 | 5787 | ); |
michael@0 | 5788 | } |
michael@0 | 5789 | #endif // HAS_INTERPOLATEROW_SSE2 |
michael@0 | 5790 | |
michael@0 | 5791 | #ifdef HAS_HALFROW_SSE2 |
michael@0 | 5792 | void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride, |
michael@0 | 5793 | uint8* dst_uv, int pix) { |
michael@0 | 5794 | asm volatile ( |
michael@0 | 5795 | "sub %0,%1 \n" |
michael@0 | 5796 | LABELALIGN |
michael@0 | 5797 | "1: \n" |
michael@0 | 5798 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5799 | MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3),%%xmm0 |
michael@0 | 5800 | "sub $0x10,%2 \n" |
michael@0 | 5801 | MEMOPMEM(movdqa,xmm0,0x00,0,1,1) // movdqa %%xmm0,(%0,%1) |
michael@0 | 5802 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 5803 | "jg 1b \n" |
michael@0 | 5804 | : "+r"(src_uv), // %0 |
michael@0 | 5805 | "+r"(dst_uv), // %1 |
michael@0 | 5806 | "+r"(pix) // %2 |
michael@0 | 5807 | : "r"((intptr_t)(src_uv_stride)) // %3 |
michael@0 | 5808 | : "memory", "cc" |
michael@0 | 5809 | #if defined(__SSE2__) |
michael@0 | 5810 | , "xmm0" |
michael@0 | 5811 | #endif |
michael@0 | 5812 | ); |
michael@0 | 5813 | } |
michael@0 | 5814 | #endif // HAS_HALFROW_SSE2 |
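
// Scalar sketch of HalfRow: each output byte is the rounded average of the
// same column in two adjacent rows, matching pavgb's (a + b + 1) >> 1.
// Hypothetical reference helper.
static void HalfRow_Ref(const uint8* src_uv, int src_uv_stride,
                        uint8* dst_uv, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_uv[x] = (uint8)((src_uv[x] + src_uv[src_uv_stride + x] + 1) >> 1);
  }
}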
michael@0 | 5815 | |
michael@0 | 5816 | #ifdef HAS_ARGBTOBAYERROW_SSSE3 |
michael@0 | 5817 | void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, |
michael@0 | 5818 | uint32 selector, int pix) { |
michael@0 | 5819 | asm volatile ( |
michael@0 | 5820 | // NaCl caveat - assumes the movd source is a GPR. |
michael@0 | 5821 | "movd %3,%%xmm5 \n" |
michael@0 | 5822 | "pshufd $0x0,%%xmm5,%%xmm5 \n" |
michael@0 | 5823 | LABELALIGN |
michael@0 | 5824 | "1: \n" |
michael@0 | 5825 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5826 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 5827 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 5828 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 5829 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 5830 | "punpckldq %%xmm1,%%xmm0 \n" |
michael@0 | 5831 | "sub $0x8,%2 \n" |
michael@0 | 5832 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 5833 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 5834 | "jg 1b \n" |
michael@0 | 5835 | : "+r"(src_argb), // %0 |
michael@0 | 5836 | "+r"(dst_bayer), // %1 |
michael@0 | 5837 | "+r"(pix) // %2 |
michael@0 | 5838 | : "g"(selector) // %3 |
michael@0 | 5839 | : "memory", "cc" |
michael@0 | 5840 | #if defined(__SSE2__) |
michael@0 | 5841 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 5842 | #endif |
michael@0 | 5843 | ); |
michael@0 | 5844 | } |
michael@0 | 5845 | #endif // HAS_ARGBTOBAYERROW_SSSE3 |
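
// Scalar sketch of the Bayer selection: the four selector bytes are pshufb
// indices into each 16-byte group of 4 ARGB pixels, so every group of 4
// pixels yields 4 Bayer bytes. Hypothetical helper; assumes pix is a
// multiple of 4, as the SIMD loop (which handles 8 per pass) effectively
// requires.
static void ARGBToBayerRow_Ref(const uint8* src_argb, uint8* dst_bayer,
                               uint32 selector, int pix) {
  for (int x = 0; x < pix; x += 4) {
    dst_bayer[0] = src_argb[selector & 0xff];
    dst_bayer[1] = src_argb[(selector >> 8) & 0xff];
    dst_bayer[2] = src_argb[(selector >> 16) & 0xff];
    dst_bayer[3] = src_argb[(selector >> 24) & 0xff];
    src_argb += 16;
    dst_bayer += 4;
  }
}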
michael@0 | 5846 | |
michael@0 | 5847 | #ifdef HAS_ARGBTOBAYERGGROW_SSE2 |
michael@0 | 5848 | void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, |
michael@0 | 5849 | uint32 selector, int pix) { |
michael@0 | 5850 | asm volatile ( |
michael@0 | 5851 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 5852 | "psrld $0x18,%%xmm5 \n" |
michael@0 | 5853 | LABELALIGN |
michael@0 | 5854 | "1: \n" |
michael@0 | 5855 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5856 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 5857 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 5858 | "psrld $0x8,%%xmm0 \n" |
michael@0 | 5859 | "psrld $0x8,%%xmm1 \n" |
michael@0 | 5860 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 5861 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 5862 | "packssdw %%xmm1,%%xmm0 \n" |
michael@0 | 5863 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 5864 | "sub $0x8,%2 \n" |
michael@0 | 5865 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 5866 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 5867 | "jg 1b \n" |
michael@0 | 5868 | : "+r"(src_argb), // %0 |
michael@0 | 5869 | "+r"(dst_bayer), // %1 |
michael@0 | 5870 | "+r"(pix) // %2 |
michael@0 | 5871 | : |
michael@0 | 5872 | : "memory", "cc" |
michael@0 | 5873 | #if defined(__SSE2__) |
michael@0 | 5874 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 5875 | #endif |
michael@0 | 5876 | ); |
michael@0 | 5877 | } |
michael@0 | 5878 | #endif // HAS_ARGBTOBAYERGGROW_SSE2 |
michael@0 | 5879 | |
michael@0 | 5880 | #ifdef HAS_ARGBSHUFFLEROW_SSSE3 |
michael@0 | 5881 | // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
michael@0 | 5882 | void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 5883 | const uint8* shuffler, int pix) { |
michael@0 | 5884 | asm volatile ( |
michael@0 | 5885 | "movdqa " MEMACCESS(3) ",%%xmm5 \n" |
michael@0 | 5886 | LABELALIGN |
michael@0 | 5887 | "1: \n" |
michael@0 | 5888 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5889 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 5890 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 5891 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 5892 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 5893 | "sub $0x8,%2 \n" |
michael@0 | 5894 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 5895 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 5896 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 5897 | "jg 1b \n" |
michael@0 | 5898 | : "+r"(src_argb), // %0 |
michael@0 | 5899 | "+r"(dst_argb), // %1 |
michael@0 | 5900 | "+r"(pix) // %2 |
michael@0 | 5901 | : "r"(shuffler) // %3 |
michael@0 | 5902 | : "memory", "cc" |
michael@0 | 5903 | #if defined(__SSE2__) |
michael@0 | 5904 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 5905 | #endif |
michael@0 | 5906 | ); |
michael@0 | 5907 | } |
michael@0 | 5908 | |
michael@0 | 5909 | void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 5910 | const uint8* shuffler, int pix) { |
michael@0 | 5911 | asm volatile ( |
michael@0 | 5912 | "movdqa " MEMACCESS(3) ",%%xmm5 \n" |
michael@0 | 5913 | LABELALIGN |
michael@0 | 5914 | "1: \n" |
michael@0 | 5915 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 5916 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 5917 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 5918 | "pshufb %%xmm5,%%xmm0 \n" |
michael@0 | 5919 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 5920 | "sub $0x8,%2 \n" |
michael@0 | 5921 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 5922 | "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 5923 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 5924 | "jg 1b \n" |
michael@0 | 5925 | : "+r"(src_argb), // %0 |
michael@0 | 5926 | "+r"(dst_argb), // %1 |
michael@0 | 5927 | "+r"(pix) // %2 |
michael@0 | 5928 | : "r"(shuffler) // %3 |
michael@0 | 5929 | : "memory", "cc" |
michael@0 | 5930 | #if defined(__SSE2__) |
michael@0 | 5931 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 5932 | #endif |
michael@0 | 5933 | ); |
michael@0 | 5934 | } |
michael@0 | 5935 | #endif // HAS_ARGBSHUFFLEROW_SSSE3 |
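
// Scalar sketch of the shuffle: output channel b of each pixel is input
// channel shuffler[b]. The 16-byte shuffler consumed by the pshufb versions
// above repeats this 4-byte pattern with +4, +8 and +12 offsets for the
// remaining pixels of the vector. Hypothetical reference helper.
static void ARGBShuffleRow_Ref(const uint8* src_argb, uint8* dst_argb,
                               const uint8* shuffler, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_argb[0] = src_argb[shuffler[0]];
    dst_argb[1] = src_argb[shuffler[1]];
    dst_argb[2] = src_argb[shuffler[2]];
    dst_argb[3] = src_argb[shuffler[3]];
    src_argb += 4;
    dst_argb += 4;
  }
}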
michael@0 | 5936 | |
michael@0 | 5937 | #ifdef HAS_ARGBSHUFFLEROW_AVX2 |
michael@0 | 5938 | // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
michael@0 | 5939 | void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 5940 | const uint8* shuffler, int pix) { |
michael@0 | 5941 | asm volatile ( |
michael@0 | 5942 | "vbroadcastf128 " MEMACCESS(3) ",%%ymm5 \n" |
michael@0 | 5943 | LABELALIGN |
michael@0 | 5944 | "1: \n" |
michael@0 | 5945 | "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
michael@0 | 5946 | "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
michael@0 | 5947 | "lea " MEMLEA(0x40,0) ",%0 \n" |
michael@0 | 5948 | "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" |
michael@0 | 5949 | "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" |
michael@0 | 5950 | "sub $0x10,%2 \n" |
michael@0 | 5951 | "vmovdqu %%ymm0," MEMACCESS(1) " \n" |
michael@0 | 5952 | "vmovdqu %%ymm1," MEMACCESS2(0x20,1) " \n" |
michael@0 | 5953 | "lea " MEMLEA(0x40,1) ",%1 \n" |
michael@0 | 5954 | "jg 1b \n" |
michael@0 | 5955 | : "+r"(src_argb), // %0 |
michael@0 | 5956 | "+r"(dst_argb), // %1 |
michael@0 | 5957 | "+r"(pix) // %2 |
michael@0 | 5958 | : "r"(shuffler) // %3 |
michael@0 | 5959 | : "memory", "cc" |
michael@0 | 5960 | #if defined(__SSE2__) |
michael@0 | 5961 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 5962 | #endif |
michael@0 | 5963 | ); |
michael@0 | 5964 | } |
michael@0 | 5965 | #endif // HAS_ARGBSHUFFLEROW_AVX2 |
michael@0 | 5966 | |
michael@0 | 5967 | #ifdef HAS_ARGBSHUFFLEROW_SSE2 |
michael@0 | 5968 | // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. |
michael@0 | 5969 | void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, |
michael@0 | 5970 | const uint8* shuffler, int pix) { |
michael@0 | 5971 | uintptr_t pixel_temp = 0u; |
michael@0 | 5972 | asm volatile ( |
michael@0 | 5973 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 5974 | "mov " MEMACCESS(4) ",%k2 \n" |
michael@0 | 5975 | "cmp $0x3000102,%k2 \n" |
michael@0 | 5976 | "je 3012f \n" |
michael@0 | 5977 | "cmp $0x10203,%k2 \n" |
michael@0 | 5978 | "je 123f \n" |
michael@0 | 5979 | "cmp $0x30201,%k2 \n" |
michael@0 | 5980 | "je 321f \n" |
michael@0 | 5981 | "cmp $0x2010003,%k2 \n" |
michael@0 | 5982 | "je 2103f \n" |
michael@0 | 5983 | |
michael@0 | 5984 | LABELALIGN |
michael@0 | 5985 | "1: \n" |
michael@0 | 5986 | "movzb " MEMACCESS(4) ",%2 \n" |
michael@0 | 5987 | MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 |
michael@0 | 5988 | "mov %b2," MEMACCESS(1) " \n" |
michael@0 | 5989 | "movzb " MEMACCESS2(0x1,4) ",%2 \n" |
michael@0 | 5990 | MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 |
michael@0 | 5991 | "mov %b2," MEMACCESS2(0x1,1) " \n" |
michael@0 | 5992 | BUNDLEALIGN |
michael@0 | 5993 | "movzb " MEMACCESS2(0x2,4) ",%2 \n" |
michael@0 | 5994 | MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 |
michael@0 | 5995 | "mov %b2," MEMACCESS2(0x2,1) " \n" |
michael@0 | 5996 | "movzb " MEMACCESS2(0x3,4) ",%2 \n" |
michael@0 | 5997 | MEMOPARG(movzb,0x00,0,2,1,2) " \n" // movzb (%0,%2,1),%2 |
michael@0 | 5998 | "mov %b2," MEMACCESS2(0x3,1) " \n" |
michael@0 | 5999 | "lea " MEMLEA(0x4,0) ",%0 \n" |
michael@0 | 6000 | "lea " MEMLEA(0x4,1) ",%1 \n" |
michael@0 | 6001 | "sub $0x1,%3 \n" |
michael@0 | 6002 | "jg 1b \n" |
michael@0 | 6003 | "jmp 99f \n" |
michael@0 | 6004 | |
michael@0 | 6005 | LABELALIGN |
michael@0 | 6006 | "123: \n" |
michael@0 | 6007 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6008 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 6009 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 6010 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 6011 | "punpckhbw %%xmm5,%%xmm1 \n" |
michael@0 | 6012 | "pshufhw $0x1b,%%xmm0,%%xmm0 \n" |
michael@0 | 6013 | "pshuflw $0x1b,%%xmm0,%%xmm0 \n" |
michael@0 | 6014 | "pshufhw $0x1b,%%xmm1,%%xmm1 \n" |
michael@0 | 6015 | "pshuflw $0x1b,%%xmm1,%%xmm1 \n" |
michael@0 | 6016 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 6017 | "sub $0x4,%3 \n" |
michael@0 | 6018 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 6019 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 6020 | "jg 123b \n" |
michael@0 | 6021 | "jmp 99f \n" |
michael@0 | 6022 | |
michael@0 | 6023 | LABELALIGN |
michael@0 | 6024 | "321: \n" |
michael@0 | 6025 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6026 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 6027 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 6028 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 6029 | "punpckhbw %%xmm5,%%xmm1 \n" |
michael@0 | 6030 | "pshufhw $0x39,%%xmm0,%%xmm0 \n" |
michael@0 | 6031 | "pshuflw $0x39,%%xmm0,%%xmm0 \n" |
michael@0 | 6032 | "pshufhw $0x39,%%xmm1,%%xmm1 \n" |
michael@0 | 6033 | "pshuflw $0x39,%%xmm1,%%xmm1 \n" |
michael@0 | 6034 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 6035 | "sub $0x4,%3 \n" |
michael@0 | 6036 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 6037 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 6038 | "jg 321b \n" |
michael@0 | 6039 | "jmp 99f \n" |
michael@0 | 6040 | |
michael@0 | 6041 | LABELALIGN |
michael@0 | 6042 | "2103: \n" |
michael@0 | 6043 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6044 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 6045 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 6046 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 6047 | "punpckhbw %%xmm5,%%xmm1 \n" |
michael@0 | 6048 | "pshufhw $0x93,%%xmm0,%%xmm0 \n" |
michael@0 | 6049 | "pshuflw $0x93,%%xmm0,%%xmm0 \n" |
michael@0 | 6050 | "pshufhw $0x93,%%xmm1,%%xmm1 \n" |
michael@0 | 6051 | "pshuflw $0x93,%%xmm1,%%xmm1 \n" |
michael@0 | 6052 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 6053 | "sub $0x4,%3 \n" |
michael@0 | 6054 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 6055 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 6056 | "jg 2103b \n" |
michael@0 | 6057 | "jmp 99f \n" |
michael@0 | 6058 | |
michael@0 | 6059 | LABELALIGN |
michael@0 | 6060 | "3012: \n" |
michael@0 | 6061 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6062 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 6063 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 6064 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 6065 | "punpckhbw %%xmm5,%%xmm1 \n" |
michael@0 | 6066 | "pshufhw $0xc6,%%xmm0,%%xmm0 \n" |
michael@0 | 6067 | "pshuflw $0xc6,%%xmm0,%%xmm0 \n" |
michael@0 | 6068 | "pshufhw $0xc6,%%xmm1,%%xmm1 \n" |
michael@0 | 6069 | "pshuflw $0xc6,%%xmm1,%%xmm1 \n" |
michael@0 | 6070 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 6071 | "sub $0x4,%3 \n" |
michael@0 | 6072 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 6073 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 6074 | "jg 3012b \n" |
michael@0 | 6075 | |
michael@0 | 6076 | "99: \n" |
michael@0 | 6077 | : "+r"(src_argb), // %0 |
michael@0 | 6078 | "+r"(dst_argb), // %1 |
michael@0 | 6079 | "+d"(pixel_temp), // %2 |
michael@0 | 6080 | "+r"(pix) // %3 |
michael@0 | 6081 | : "r"(shuffler) // %4 |
michael@0 | 6082 | : "memory", "cc" |
michael@0 | 6083 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 6084 | , "r14" |
michael@0 | 6085 | #endif |
michael@0 | 6086 | #if defined(__SSE2__) |
michael@0 | 6087 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 6088 | #endif |
michael@0 | 6089 | ); |
michael@0 | 6090 | } |
michael@0 | 6091 | #endif // HAS_ARGBSHUFFLEROW_SSE2 |
michael@0 | 6092 | |
michael@0 | 6093 | #ifdef HAS_I422TOYUY2ROW_SSE2 |
michael@0 | 6094 | void I422ToYUY2Row_SSE2(const uint8* src_y, |
michael@0 | 6095 | const uint8* src_u, |
michael@0 | 6096 | const uint8* src_v, |
michael@0 | 6097 | uint8* dst_frame, int width) { |
michael@0 | 6098 | asm volatile ( |
michael@0 | 6099 | "sub %1,%2 \n" |
michael@0 | 6100 | LABELALIGN |
michael@0 | 6101 | "1: \n" |
michael@0 | 6102 | "movq " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 6103 | MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 |
michael@0 | 6104 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 6105 | "punpcklbw %%xmm3,%%xmm2 \n" |
michael@0 | 6106 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6107 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 6108 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 6109 | "punpcklbw %%xmm2,%%xmm0 \n" |
michael@0 | 6110 | "punpckhbw %%xmm2,%%xmm1 \n" |
michael@0 | 6111 | "movdqu %%xmm0," MEMACCESS(3) " \n" |
michael@0 | 6112 | "movdqu %%xmm1," MEMACCESS2(0x10,3) " \n" |
michael@0 | 6113 | "lea " MEMLEA(0x20,3) ",%3 \n" |
michael@0 | 6114 | "sub $0x10,%4 \n" |
michael@0 | 6115 | "jg 1b \n" |
michael@0 | 6116 | : "+r"(src_y), // %0 |
michael@0 | 6117 | "+r"(src_u), // %1 |
michael@0 | 6118 | "+r"(src_v), // %2 |
michael@0 | 6119 | "+r"(dst_frame), // %3 |
michael@0 | 6120 | "+rm"(width) // %4 |
michael@0 | 6121 | : |
michael@0 | 6122 | : "memory", "cc" |
michael@0 | 6123 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 6124 | , "r14" |
michael@0 | 6125 | #endif |
michael@0 | 6126 | #if defined(__SSE2__) |
michael@0 | 6127 | , "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 6128 | #endif |
michael@0 | 6129 | ); |
michael@0 | 6130 | } |
michael@0 | 6131 | #endif // HAS_I422TOYUY2ROW_SSE2 |
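
// Scalar sketch of the YUY2 packing: two Y samples share one U and one V,
// in the byte order Y0 U Y1 V (UYVY below is the same idea with the order
// U Y0 V Y1). Hypothetical reference helper; assumes an even width, as the
// asm does.
static void I422ToYUY2Row_Ref(const uint8* src_y, const uint8* src_u,
                              const uint8* src_v, uint8* dst_frame,
                              int width) {
  for (int x = 0; x < width; x += 2) {
    dst_frame[0] = src_y[0];
    dst_frame[1] = src_u[0];
    dst_frame[2] = src_y[1];
    dst_frame[3] = src_v[0];
    dst_frame += 4;
    src_y += 2;
    src_u += 1;
    src_v += 1;
  }
}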
michael@0 | 6132 | |
michael@0 | 6133 | #ifdef HAS_I422TOUYVYROW_SSE2 |
michael@0 | 6134 | void I422ToUYVYRow_SSE2(const uint8* src_y, |
michael@0 | 6135 | const uint8* src_u, |
michael@0 | 6136 | const uint8* src_v, |
michael@0 | 6137 | uint8* dst_frame, int width) { |
michael@0 | 6138 | asm volatile ( |
michael@0 | 6139 | "sub %1,%2 \n" |
michael@0 | 6140 | LABELALIGN |
michael@0 | 6141 | "1: \n" |
michael@0 | 6142 | "movq " MEMACCESS(1) ",%%xmm2 \n" |
michael@0 | 6143 | MEMOPREG(movq,0x00,1,2,1,xmm3) // movq (%1,%2,1),%%xmm3 |
michael@0 | 6144 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 6145 | "punpcklbw %%xmm3,%%xmm2 \n" |
michael@0 | 6146 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6147 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 6148 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 6149 | "punpcklbw %%xmm0,%%xmm1 \n" |
michael@0 | 6150 | "punpckhbw %%xmm0,%%xmm2 \n" |
michael@0 | 6151 | "movdqu %%xmm1," MEMACCESS(3) " \n" |
michael@0 | 6152 | "movdqu %%xmm2," MEMACCESS2(0x10,3) " \n" |
michael@0 | 6153 | "lea " MEMLEA(0x20,3) ",%3 \n" |
michael@0 | 6154 | "sub $0x10,%4 \n" |
michael@0 | 6155 | "jg 1b \n" |
michael@0 | 6156 | : "+r"(src_y), // %0 |
michael@0 | 6157 | "+r"(src_u), // %1 |
michael@0 | 6158 | "+r"(src_v), // %2 |
michael@0 | 6159 | "+r"(dst_frame), // %3 |
michael@0 | 6160 | "+rm"(width) // %4 |
michael@0 | 6161 | : |
michael@0 | 6162 | : "memory", "cc" |
michael@0 | 6163 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 6164 | , "r14" |
michael@0 | 6165 | #endif |
michael@0 | 6166 | #if defined(__SSE2__) |
michael@0 | 6167 | , "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 6168 | #endif |
michael@0 | 6169 | ); |
michael@0 | 6170 | } |
michael@0 | 6171 | #endif // HAS_I422TOUYVYROW_SSE2 |
michael@0 | 6172 | |
michael@0 | 6173 | #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 |
michael@0 | 6174 | void ARGBPolynomialRow_SSE2(const uint8* src_argb, |
michael@0 | 6175 | uint8* dst_argb, const float* poly, |
michael@0 | 6176 | int width) { |
michael@0 | 6177 | asm volatile ( |
michael@0 | 6178 | "pxor %%xmm3,%%xmm3 \n" |
michael@0 | 6179 | |
michael@0 | 6180 | // 2 pixel loop. |
michael@0 | 6181 | LABELALIGN |
michael@0 | 6182 | "1: \n" |
michael@0 | 6183 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 6184 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 6185 | "punpcklbw %%xmm3,%%xmm0 \n" |
michael@0 | 6186 | "movdqa %%xmm0,%%xmm4 \n" |
michael@0 | 6187 | "punpcklwd %%xmm3,%%xmm0 \n" |
michael@0 | 6188 | "punpckhwd %%xmm3,%%xmm4 \n" |
michael@0 | 6189 | "cvtdq2ps %%xmm0,%%xmm0 \n" |
michael@0 | 6190 | "cvtdq2ps %%xmm4,%%xmm4 \n" |
michael@0 | 6191 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 6192 | "movdqa %%xmm4,%%xmm5 \n" |
michael@0 | 6193 | "mulps " MEMACCESS2(0x10,3) ",%%xmm0 \n" |
michael@0 | 6194 | "mulps " MEMACCESS2(0x10,3) ",%%xmm4 \n" |
michael@0 | 6195 | "addps " MEMACCESS(3) ",%%xmm0 \n" |
michael@0 | 6196 | "addps " MEMACCESS(3) ",%%xmm4 \n" |
michael@0 | 6197 | "movdqa %%xmm1,%%xmm2 \n" |
michael@0 | 6198 | "movdqa %%xmm5,%%xmm6 \n" |
michael@0 | 6199 | "mulps %%xmm1,%%xmm2 \n" |
michael@0 | 6200 | "mulps %%xmm5,%%xmm6 \n" |
michael@0 | 6201 | "mulps %%xmm2,%%xmm1 \n" |
michael@0 | 6202 | "mulps %%xmm6,%%xmm5 \n" |
michael@0 | 6203 | "mulps " MEMACCESS2(0x20,3) ",%%xmm2 \n" |
michael@0 | 6204 | "mulps " MEMACCESS2(0x20,3) ",%%xmm6 \n" |
michael@0 | 6205 | "mulps " MEMACCESS2(0x30,3) ",%%xmm1 \n" |
michael@0 | 6206 | "mulps " MEMACCESS2(0x30,3) ",%%xmm5 \n" |
michael@0 | 6207 | "addps %%xmm2,%%xmm0 \n" |
michael@0 | 6208 | "addps %%xmm6,%%xmm4 \n" |
michael@0 | 6209 | "addps %%xmm1,%%xmm0 \n" |
michael@0 | 6210 | "addps %%xmm5,%%xmm4 \n" |
michael@0 | 6211 | "cvttps2dq %%xmm0,%%xmm0 \n" |
michael@0 | 6212 | "cvttps2dq %%xmm4,%%xmm4 \n" |
michael@0 | 6213 | "packuswb %%xmm4,%%xmm0 \n" |
michael@0 | 6214 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 6215 | "sub $0x2,%2 \n" |
michael@0 | 6216 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 6217 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 6218 | "jg 1b \n" |
michael@0 | 6219 | : "+r"(src_argb), // %0 |
michael@0 | 6220 | "+r"(dst_argb), // %1 |
michael@0 | 6221 | "+r"(width) // %2 |
michael@0 | 6222 | : "r"(poly) // %3 |
michael@0 | 6223 | : "memory", "cc" |
michael@0 | 6224 | #if defined(__SSE2__) |
michael@0 | 6225 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 6226 | #endif |
michael@0 | 6227 | ); |
michael@0 | 6228 | } |
michael@0 | 6229 | #endif // HAS_ARGBPOLYNOMIALROW_SSE2 |
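
// Scalar sketch of the cubic remap: poly holds four 4-float coefficient
// vectors C0..C3 (one float per channel), and each channel value x becomes
// C0[c] + C1[c]*x + C2[c]*x^2 + C3[c]*x^3, clamped to [0, 255] as the
// saturating pack does. Hypothetical reference helper.
static void ARGBPolynomialRow_Ref(const uint8* src_argb, uint8* dst_argb,
                                  const float* poly, int width) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      float x = (float)src_argb[c];
      float v = poly[c] + poly[c + 4] * x + poly[c + 8] * x * x +
                poly[c + 12] * x * x * x;
      if (v < 0.f) v = 0.f;
      if (v > 255.f) v = 255.f;
      dst_argb[c] = (uint8)v;
    }
    src_argb += 4;
    dst_argb += 4;
  }
}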
michael@0 | 6230 | |
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4 \n"        // C0
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3

    // 2 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm0 \n"  // 2 ARGB pixels
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"            // X as 8 floats
    "vmulps %%ymm0,%%ymm0,%%ymm2 \n"        // X * X
    "vmulps %%ymm7,%%ymm0,%%ymm3 \n"        // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0 \n"   // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0 \n"   // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0 \n"   // result += C3 * X * X * X
    "vcvttps2dq %%ymm0,%%ymm0 \n"
    "vpackusdw %%ymm0,%%ymm0,%%ymm0 \n"
    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
    "vpackuswb %%xmm0,%%xmm0,%%xmm0 \n"
    "sub $0x2,%2 \n"
    "vmovq %%xmm0," MEMACCESS(1) " \n"
    "lea " MEMLEA(0x8,1) ",%1 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    // TODO(fbarchard): declare ymm usage when applicable.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2

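// Hypothetical usage sketch (table name assumed, for illustration): both
// polynomial row functions read the table as C0|C1|C2|C3 vectors of four
// per-channel floats in B,G,R,A byte order, so an identity transform
// would be expressed as:
#if 0  // illustrative sketch only
static const float kIdentityPoly[16] = {
  0.f, 0.f, 0.f, 0.f,  // C0 for B, G, R, A
  1.f, 1.f, 1.f, 1.f,  // C1
  0.f, 0.f, 0.f, 0.f,  // C2
  0.f, 0.f, 0.f, 0.f   // C3
};
// ARGBPolynomialRow_AVX2(src, dst, kIdentityPoly, width);
#endif
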
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table.
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "movzb " MEMACCESS2(-0x1,0) ",%1 \n"
    MEMOPARG(movzb,0x03,3,1,4,1) " \n"  // movzb 0x3(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x1,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86

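// Equivalent scalar sketch (assumed helper name, for illustration): the
// (%3,%1,4) addressing above means table_argb holds 256 four-byte entries,
// and each channel byte indexes its own column, so channel c of value v
// becomes table_argb[v * 4 + c]. The table is applied in place.
#if 0  // illustrative sketch only
static void ARGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                     int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb[3] = table_argb[dst_argb[3] * 4 + 3];  // A
    dst_argb += 4;
  }
}
#endif
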
#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table; alpha is left unchanged.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1: \n"
    "movzb " MEMACCESS(0) ",%1 \n"
    "lea " MEMLEA(0x4,0) ",%0 \n"
    MEMOPARG(movzb,0x00,3,1,4,1) " \n"  // movzb (%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x4,0) " \n"
    "movzb " MEMACCESS2(-0x3,0) ",%1 \n"
    MEMOPARG(movzb,0x01,3,1,4,1) " \n"  // movzb 0x1(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x3,0) " \n"
    "movzb " MEMACCESS2(-0x2,0) ",%1 \n"
    MEMOPARG(movzb,0x02,3,1,4,1) " \n"  // movzb 0x2(%3,%1,4),%1
    "mov %b1," MEMACCESS2(-0x2,0) " \n"
    "dec %2 \n"
    "jg 1b \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
#endif  // HAS_RGBCOLORTABLEROW_X86

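// Scalar sketch of the RGB variant (assumed helper name): identical to the
// ARGB sketch above except the alpha byte is skipped, matching the asm,
// which only rewrites offsets -0x4 through -0x2.
#if 0  // illustrative sketch only
static void RGBColorTableRow_Sketch(uint8* dst_argb, const uint8* table_argb,
                                    int width) {
  int x;
  for (x = 0; x < width; ++x) {
    dst_argb[0] = table_argb[dst_argb[0] * 4 + 0];  // B
    dst_argb[1] = table_argb[dst_argb[1] * 4 + 1];  // G
    dst_argb[2] = table_argb[dst_argb[2] * 4 + 2];  // R
    dst_argb += 4;  // alpha byte (offset 3) untouched
  }
}
#endif
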
#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with luma color table; alpha is copied unchanged.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;
  uintptr_t table_temp = 0u;
  asm volatile (
    "movd %6,%%xmm3 \n"                 // per-channel luma weights
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "psllw $0x8,%%xmm4 \n"              // 0xff00 word mask
    "pxor %%xmm5,%%xmm5 \n"

    // 4 pixel loop.
    LABELALIGN
    "1: \n"
    "movdqu " MEMACCESS(2) ",%%xmm0 \n"
    "pmaddubsw %%xmm3,%%xmm0 \n"        // weight the channel bytes
    "phaddw %%xmm0,%%xmm0 \n"           // per-pixel luma sums
    "pand %%xmm4,%%xmm0 \n"             // round down to 256 byte table row
    "punpcklwd %%xmm5,%%xmm0 \n"
    "movd %%xmm0,%k1 \n"                // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS(2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS(3) " \n"
    "movzb " MEMACCESS2(0x1,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x1,3) " \n"
    "movzb " MEMACCESS2(0x2,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x2,3) " \n"
    "movzb " MEMACCESS2(0x3,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x3,3) " \n"  // alpha copied directly

    "movd %%xmm0,%k1 \n"                // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x4,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x4,3) " \n"
    BUNDLEALIGN
    "movzb " MEMACCESS2(0x5,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x5,3) " \n"
    "movzb " MEMACCESS2(0x6,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x6,3) " \n"
    "movzb " MEMACCESS2(0x7,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0x7,3) " \n"

    "movd %%xmm0,%k1 \n"                // 32 bit offset
    "add %5,%1 \n"
    "pshufd $0x39,%%xmm0,%%xmm0 \n"

    "movzb " MEMACCESS2(0x8,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x8,3) " \n"
    "movzb " MEMACCESS2(0x9,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0x9,3) " \n"
    "movzb " MEMACCESS2(0xa,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xa,3) " \n"
    "movzb " MEMACCESS2(0xb,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xb,3) " \n"

    "movd %%xmm0,%k1 \n"                // 32 bit offset
    "add %5,%1 \n"

    "movzb " MEMACCESS2(0xc,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xc,3) " \n"
    "movzb " MEMACCESS2(0xd,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xd,3) " \n"
    "movzb " MEMACCESS2(0xe,2) ",%0 \n"
    MEMOPARG(movzb,0x00,1,0,1,0) " \n"  // movzb (%1,%0,1),%0
    "mov %b0," MEMACCESS2(0xe,3) " \n"
    "movzb " MEMACCESS2(0xf,2) ",%0 \n"
    "mov %b0," MEMACCESS2(0xf,3) " \n"
    "sub $0x4,%4 \n"
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "lea " MEMLEA(0x10,3) ",%3 \n"
    "jg 1b \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3

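// Scalar sketch of the luma lookup (assumed helper name; coefficient
// handling inferred from pmaddubsw, which treats the lumacoeff bytes as
// signed, and the fourth coefficient byte is assumed to be zero): a
// weighted B/G/R sum, rounded down to a multiple of 256, selects a
// 256-byte row of the luma table, and B, G and R are remapped through
// that row while alpha passes through.
#if 0  // illustrative sketch only
static void ARGBLumaColorTableRow_Sketch(const uint8* src_argb,
                                         uint8* dst_argb, int width,
                                         const uint8* luma,
                                         uint32 lumacoeff) {
  const int bc = (signed char)(lumacoeff & 0xff);
  const int gc = (signed char)((lumacoeff >> 8) & 0xff);
  const int rc = (signed char)((lumacoeff >> 16) & 0xff);
  int x;
  for (x = 0; x < width; ++x) {
    // Sum assumed to stay in int16 range, matching the vector word math.
    const uint8* row = luma +
        ((src_argb[0] * bc + src_argb[1] * gc + src_argb[2] * rc) & 0xff00);
    dst_argb[0] = row[src_argb[0]];
    dst_argb[1] = row[src_argb[1]];
    dst_argb[2] = row[src_argb[2]];
    dst_argb[3] = src_argb[3];  // alpha copied unchanged
    src_argb += 4;
    dst_argb += 4;
  }
}
#endif
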
#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif