media/libyuv/source/scale_posix.cc

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Thu, 22 Jan 2015 13:21:57 +0100
branch      TOR_BUG_9701
changeset   15:b8a032363ba2
permissions -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license
michael@0 5 * that can be found in the LICENSE file in the root of the source
michael@0 6 * tree. An additional intellectual property rights grant can be found
michael@0 7 * in the file PATENTS. All contributing project authors may
michael@0 8 * be found in the AUTHORS file in the root of the source tree.
michael@0 9 */
michael@0 10
michael@0 11 #include "libyuv/row.h"
michael@0 12
michael@0 13 #ifdef __cplusplus
michael@0 14 namespace libyuv {
michael@0 15 extern "C" {
michael@0 16 #endif
michael@0 17
michael@0 18 // This module is for GCC x86 and x64.
michael@0 19 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
michael@0 20
michael@0 21 // Offsets for source bytes 0 to 9
michael@0 22 static uvec8 kShuf0 =
michael@0 23 { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 };
michael@0 24
michael@0 25 // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12.
michael@0 26 static uvec8 kShuf1 =
michael@0 27 { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 };
michael@0 28
michael@0 29 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
michael@0 30 static uvec8 kShuf2 =
michael@0 31 { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 };
michael@0 32
michael@0 33 // Offsets for source bytes 0 to 10
michael@0 34 static uvec8 kShuf01 =
michael@0 35 { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 };
michael@0 36
michael@0 37 // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13.
michael@0 38 static uvec8 kShuf11 =
michael@0 39 { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 };
michael@0 40
michael@0 41 // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15.
michael@0 42 static uvec8 kShuf21 =
michael@0 43 { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 };
michael@0 44
michael@0 45 // Coefficients for source bytes 0 to 10
michael@0 46 static uvec8 kMadd01 =
michael@0 47 { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 };
michael@0 48
michael@0 49 // Coefficients for source bytes 10 to 21
michael@0 50 static uvec8 kMadd11 =
michael@0 51 { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 };
michael@0 52
michael@0 53 // Coefficients for source bytes 21 to 31
michael@0 54 static uvec8 kMadd21 =
michael@0 55 { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 };
michael@0 56
michael@0 57 // Rounding constant added before the final >>2 in the 3/4 box filters
michael@0 58 static vec16 kRound34 =
michael@0 59 { 2, 2, 2, 2, 2, 2, 2, 2 };
michael@0 60
michael@0 61 static uvec8 kShuf38a =
michael@0 62 { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
michael@0 63
michael@0 64 static uvec8 kShuf38b =
michael@0 65 { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 };
michael@0 66
michael@0 67 // Arrange words 0,3,6 into 0,1,2
michael@0 68 static uvec8 kShufAc =
michael@0 69 { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 };
michael@0 70
michael@0 71 // Arrange words 0,3,6 into 3,4,5
michael@0 72 static uvec8 kShufAc3 =
michael@0 73 { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 };
michael@0 74
michael@0 75 // Scaling values for boxes of 3x3 and 2x3
michael@0 76 static uvec16 kScaleAc33 =
michael@0 77 { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 };
michael@0 78
michael@0 79 // Arrange first value for pixels 0,1,2,3,4,5
michael@0 80 static uvec8 kShufAb0 =
michael@0 81 { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 };
michael@0 82
michael@0 83 // Arrange second value for pixels 0,1,2,3,4,5
michael@0 84 static uvec8 kShufAb1 =
michael@0 85 { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 };
michael@0 86
michael@0 87 // Arrange third value for pixels 0,1,2,3,4,5
michael@0 88 static uvec8 kShufAb2 =
michael@0 89 { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 };
michael@0 90
michael@0 91 // Scaling values for boxes of 3x2 and 2x2
michael@0 92 static uvec16 kScaleAb2 =
michael@0 93 { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
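
// Note on the kScaleAc33/kScaleAb2 constants: pmulhuw computes
// (a * b) >> 16 on unsigned 16-bit lanes, so multiplying a box sum by
// 65536 / n approximates division by n. Worked example (illustrative, not
// part of the original source): a 3x3 box summing to 1800 gives
// (1800 * (65536 / 9)) >> 16 = (1800 * 7281) >> 16 = 199, i.e. ~1800 / 9.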
michael@0 94
michael@0 95 // GCC versions of row functions are verbatim conversions from Visual C.
michael@0 96 // Generated using gcc disassembly on Visual C object file:
michael@0 97 // objdump -D yuvscaler.obj >yuvscaler.txt
michael@0 98
michael@0 99 void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 100 uint8* dst_ptr, int dst_width) {
michael@0 101 asm volatile (
michael@0 102 LABELALIGN
michael@0 103 "1: \n"
michael@0 104 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 105 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 106 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 107 "psrlw $0x8,%%xmm0 \n"
michael@0 108 "psrlw $0x8,%%xmm1 \n"
michael@0 109 "packuswb %%xmm1,%%xmm0 \n"
michael@0 110 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 111 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 112 "sub $0x10,%2 \n"
michael@0 113 "jg 1b \n"
michael@0 114 : "+r"(src_ptr), // %0
michael@0 115 "+r"(dst_ptr), // %1
michael@0 116 "+r"(dst_width) // %2
michael@0 117 :
michael@0 118 : "memory", "cc"
michael@0 119 #if defined(__SSE2__)
michael@0 120 , "xmm0", "xmm1"
michael@0 121 #endif
michael@0 122 );
michael@0 123 }
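
// Illustrative plain-C sketch of ScaleRowDown2_SSE2 above (hypothetical
// helper, not part of the original file): psrlw $8 keeps the high byte of
// each 16-bit word, so every output pixel is the odd-indexed source byte of
// a horizontal pair.
static void ScaleRowDown2_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  // Unused: this path point-samples a single row.
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];
  }
}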
michael@0 124
michael@0 125 void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 126 uint8* dst_ptr, int dst_width) {
michael@0 127 asm volatile (
michael@0 128 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 129 "psrlw $0x8,%%xmm5 \n"
michael@0 130
michael@0 131 LABELALIGN
michael@0 132 "1: \n"
michael@0 133 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 134 "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n"
michael@0 135 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 136 "movdqa %%xmm0,%%xmm2 \n"
michael@0 137 "psrlw $0x8,%%xmm0 \n"
michael@0 138 "movdqa %%xmm1,%%xmm3 \n"
michael@0 139 "psrlw $0x8,%%xmm1 \n"
michael@0 140 "pand %%xmm5,%%xmm2 \n"
michael@0 141 "pand %%xmm5,%%xmm3 \n"
michael@0 142 "pavgw %%xmm2,%%xmm0 \n"
michael@0 143 "pavgw %%xmm3,%%xmm1 \n"
michael@0 144 "packuswb %%xmm1,%%xmm0 \n"
michael@0 145 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 146 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 147 "sub $0x10,%2 \n"
michael@0 148 "jg 1b \n"
michael@0 149 : "+r"(src_ptr), // %0
michael@0 150 "+r"(dst_ptr), // %1
michael@0 151 "+r"(dst_width) // %2
michael@0 152 :
michael@0 153 : "memory", "cc"
michael@0 154 #if defined(__SSE2__)
michael@0 155 , "xmm0", "xmm1", "xmm5"
michael@0 156 #endif
michael@0 157 );
michael@0 158 }
michael@0 159
michael@0 160 void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 161 uint8* dst_ptr, int dst_width) {
michael@0 162 asm volatile (
michael@0 163 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 164 "psrlw $0x8,%%xmm5 \n"
michael@0 165
michael@0 166 LABELALIGN
michael@0 167 "1: \n"
michael@0 168 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 169 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 170 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
michael@0 171 BUNDLEALIGN
michael@0 172 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
michael@0 173 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 174 "pavgb %%xmm2,%%xmm0 \n"
michael@0 175 "pavgb %%xmm3,%%xmm1 \n"
michael@0 176 "movdqa %%xmm0,%%xmm2 \n"
michael@0 177 "psrlw $0x8,%%xmm0 \n"
michael@0 178 "movdqa %%xmm1,%%xmm3 \n"
michael@0 179 "psrlw $0x8,%%xmm1 \n"
michael@0 180 "pand %%xmm5,%%xmm2 \n"
michael@0 181 "pand %%xmm5,%%xmm3 \n"
michael@0 182 "pavgw %%xmm2,%%xmm0 \n"
michael@0 183 "pavgw %%xmm3,%%xmm1 \n"
michael@0 184 "packuswb %%xmm1,%%xmm0 \n"
michael@0 185 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 186 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 187 "sub $0x10,%2 \n"
michael@0 188 "jg 1b \n"
michael@0 189 : "+r"(src_ptr), // %0
michael@0 190 "+r"(dst_ptr), // %1
michael@0 191 "+r"(dst_width) // %2
michael@0 192 : "r"((intptr_t)(src_stride)) // %3
michael@0 193 : "memory", "cc"
michael@0 194 #if defined(__native_client__) && defined(__x86_64__)
michael@0 195 , "r14"
michael@0 196 #endif
michael@0 197 #if defined(__SSE2__)
michael@0 198 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 199 #endif
michael@0 200 );
michael@0 201 }
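
// Illustrative plain-C sketch of the 2x2 box filter above (hypothetical
// helper, not part of the original file). The chained pavgb/pavgw in the
// SSE2 code rounds slightly differently from this exact-sum version.
static void ScaleRowDown2Box_C_Sketch(const uint8* src_ptr,
                                      ptrdiff_t src_stride,
                                      uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    // Average a 2x2 block of source pixels with rounding.
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}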
michael@0 202
michael@0 203 void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 204 uint8* dst_ptr, int dst_width) {
michael@0 205 asm volatile (
michael@0 206 LABELALIGN
michael@0 207 "1: \n"
michael@0 208 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 209 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 210 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 211 "psrlw $0x8,%%xmm0 \n"
michael@0 212 "psrlw $0x8,%%xmm1 \n"
michael@0 213 "packuswb %%xmm1,%%xmm0 \n"
michael@0 214 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 215 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 216 "sub $0x10,%2 \n"
michael@0 217 "jg 1b \n"
michael@0 218 : "+r"(src_ptr), // %0
michael@0 219 "+r"(dst_ptr), // %1
michael@0 220 "+r"(dst_width) // %2
michael@0 221 :
michael@0 222 : "memory", "cc"
michael@0 223 #if defined(__SSE2__)
michael@0 224 , "xmm0", "xmm1"
michael@0 225 #endif
michael@0 226 );
michael@0 227 }
michael@0 228
michael@0 229 void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr,
michael@0 230 ptrdiff_t src_stride,
michael@0 231 uint8* dst_ptr, int dst_width) {
michael@0 232 asm volatile (
michael@0 233 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 234 "psrlw $0x8,%%xmm5 \n"
michael@0 235
michael@0 236 LABELALIGN
michael@0 237 "1: \n"
michael@0 238 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 239 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 240 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 241 "movdqa %%xmm0,%%xmm2 \n"
michael@0 242 "psrlw $0x8,%%xmm0 \n"
michael@0 243 "movdqa %%xmm1,%%xmm3 \n"
michael@0 244 "psrlw $0x8,%%xmm1 \n"
michael@0 245 "pand %%xmm5,%%xmm2 \n"
michael@0 246 "pand %%xmm5,%%xmm3 \n"
michael@0 247 "pavgw %%xmm2,%%xmm0 \n"
michael@0 248 "pavgw %%xmm3,%%xmm1 \n"
michael@0 249 "packuswb %%xmm1,%%xmm0 \n"
michael@0 250 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 251 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 252 "sub $0x10,%2 \n"
michael@0 253 "jg 1b \n"
michael@0 254 : "+r"(src_ptr), // %0
michael@0 255 "+r"(dst_ptr), // %1
michael@0 256 "+r"(dst_width) // %2
michael@0 257 :
michael@0 258 : "memory", "cc"
michael@0 259 #if defined(__SSE2__)
michael@0 260 , "xmm0", "xmm1", "xmm5"
michael@0 261 #endif
michael@0 262 );
michael@0 263 }
michael@0 264
michael@0 265 void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr,
michael@0 266 ptrdiff_t src_stride,
michael@0 267 uint8* dst_ptr, int dst_width) {
michael@0 268 asm volatile (
michael@0 269 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 270 "psrlw $0x8,%%xmm5 \n"
michael@0 271
michael@0 272 LABELALIGN
michael@0 273 "1: \n"
michael@0 274 "movdqu " MEMACCESS(0) ",%%xmm0 \n"
michael@0 275 "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 276 MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2
michael@0 277 BUNDLEALIGN
michael@0 278 MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3
michael@0 279 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 280 "pavgb %%xmm2,%%xmm0 \n"
michael@0 281 "pavgb %%xmm3,%%xmm1 \n"
michael@0 282 "movdqa %%xmm0,%%xmm2 \n"
michael@0 283 "psrlw $0x8,%%xmm0 \n"
michael@0 284 "movdqa %%xmm1,%%xmm3 \n"
michael@0 285 "psrlw $0x8,%%xmm1 \n"
michael@0 286 "pand %%xmm5,%%xmm2 \n"
michael@0 287 "pand %%xmm5,%%xmm3 \n"
michael@0 288 "pavgw %%xmm2,%%xmm0 \n"
michael@0 289 "pavgw %%xmm3,%%xmm1 \n"
michael@0 290 "packuswb %%xmm1,%%xmm0 \n"
michael@0 291 "movdqu %%xmm0," MEMACCESS(1) " \n"
michael@0 292 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 293 "sub $0x10,%2 \n"
michael@0 294 "jg 1b \n"
michael@0 295 : "+r"(src_ptr), // %0
michael@0 296 "+r"(dst_ptr), // %1
michael@0 297 "+r"(dst_width) // %2
michael@0 298 : "r"((intptr_t)(src_stride)) // %3
michael@0 299 : "memory", "cc"
michael@0 300 #if defined(__native_client__) && defined(__x86_64__)
michael@0 301 , "r14"
michael@0 302 #endif
michael@0 303 #if defined(__SSE2__)
michael@0 304 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
michael@0 305 #endif
michael@0 306 );
michael@0 307 }
michael@0 308
michael@0 309 void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 310 uint8* dst_ptr, int dst_width) {
michael@0 311 asm volatile (
michael@0 312 "pcmpeqb %%xmm5,%%xmm5 \n"
michael@0 313 "psrld $0x18,%%xmm5 \n"
michael@0 314 "pslld $0x10,%%xmm5 \n"
michael@0 315
michael@0 316 LABELALIGN
michael@0 317 "1: \n"
michael@0 318 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 319 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 320 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 321 "pand %%xmm5,%%xmm0 \n"
michael@0 322 "pand %%xmm5,%%xmm1 \n"
michael@0 323 "packuswb %%xmm1,%%xmm0 \n"
michael@0 324 "psrlw $0x8,%%xmm0 \n"
michael@0 325 "packuswb %%xmm0,%%xmm0 \n"
michael@0 326 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 327 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 328 "sub $0x8,%2 \n"
michael@0 329 "jg 1b \n"
michael@0 330 : "+r"(src_ptr), // %0
michael@0 331 "+r"(dst_ptr), // %1
michael@0 332 "+r"(dst_width) // %2
michael@0 333 :
michael@0 334 : "memory", "cc"
michael@0 335 #if defined(__SSE2__)
michael@0 336 , "xmm0", "xmm1", "xmm5"
michael@0 337 #endif
michael@0 338 );
michael@0 339 }
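
// Illustrative plain-C sketch of ScaleRowDown4_SSE2 above (hypothetical
// helper, not part of the original file): the 0x00FF0000 mask keeps byte 2
// of every dword, so each output pixel is the third byte of a group of four.
static void ScaleRowDown4_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                   uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  // Unused: this path point-samples a single row.
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 4 + 2];
  }
}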
michael@0 340
michael@0 341 void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 342 uint8* dst_ptr, int dst_width) {
michael@0 343 intptr_t stridex3 = 0;
michael@0 344 asm volatile (
michael@0 345 "pcmpeqb %%xmm7,%%xmm7 \n"
michael@0 346 "psrlw $0x8,%%xmm7 \n"
michael@0 347 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n"
michael@0 348
michael@0 349 LABELALIGN
michael@0 350 "1: \n"
michael@0 351 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 352 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 353 MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2
michael@0 354 BUNDLEALIGN
michael@0 355 MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3
michael@0 356 "pavgb %%xmm2,%%xmm0 \n"
michael@0 357 "pavgb %%xmm3,%%xmm1 \n"
michael@0 358 MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2
michael@0 359 BUNDLEALIGN
michael@0 360 MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3
michael@0 361 MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4
michael@0 362 MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5
michael@0 363 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 364 "pavgb %%xmm4,%%xmm2 \n"
michael@0 365 "pavgb %%xmm2,%%xmm0 \n"
michael@0 366 "pavgb %%xmm5,%%xmm3 \n"
michael@0 367 "pavgb %%xmm3,%%xmm1 \n"
michael@0 368 "movdqa %%xmm0,%%xmm2 \n"
michael@0 369 "psrlw $0x8,%%xmm0 \n"
michael@0 370 "movdqa %%xmm1,%%xmm3 \n"
michael@0 371 "psrlw $0x8,%%xmm1 \n"
michael@0 372 "pand %%xmm7,%%xmm2 \n"
michael@0 373 "pand %%xmm7,%%xmm3 \n"
michael@0 374 "pavgw %%xmm2,%%xmm0 \n"
michael@0 375 "pavgw %%xmm3,%%xmm1 \n"
michael@0 376 "packuswb %%xmm1,%%xmm0 \n"
michael@0 377 "movdqa %%xmm0,%%xmm2 \n"
michael@0 378 "psrlw $0x8,%%xmm0 \n"
michael@0 379 "pand %%xmm7,%%xmm2 \n"
michael@0 380 "pavgw %%xmm2,%%xmm0 \n"
michael@0 381 "packuswb %%xmm0,%%xmm0 \n"
michael@0 382 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 383 "lea " MEMLEA(0x8,1) ",%1 \n"
michael@0 384 "sub $0x8,%2 \n"
michael@0 385 "jg 1b \n"
michael@0 386 : "+r"(src_ptr), // %0
michael@0 387 "+r"(dst_ptr), // %1
michael@0 388 "+r"(dst_width), // %2
michael@0 389 "+r"(stridex3) // %3
michael@0 390 : "r"((intptr_t)(src_stride)) // %4
michael@0 391 : "memory", "cc"
michael@0 392 #if defined(__native_client__) && defined(__x86_64__)
michael@0 393 , "r14"
michael@0 394 #endif
michael@0 395 #if defined(__SSE2__)
michael@0 396 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7"
michael@0 397 #endif
michael@0 398 );
michael@0 399 }
michael@0 400
michael@0 401 void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 402 uint8* dst_ptr, int dst_width) {
michael@0 403 asm volatile (
michael@0 404 "movdqa %0,%%xmm3 \n"
michael@0 405 "movdqa %1,%%xmm4 \n"
michael@0 406 "movdqa %2,%%xmm5 \n"
michael@0 407 :
michael@0 408 : "m"(kShuf0), // %0
michael@0 409 "m"(kShuf1), // %1
michael@0 410 "m"(kShuf2) // %2
michael@0 411 );
michael@0 412 asm volatile (
michael@0 413 LABELALIGN
michael@0 414 "1: \n"
michael@0 415 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 416 "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n"
michael@0 417 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 418 "movdqa %%xmm2,%%xmm1 \n"
michael@0 419 "palignr $0x8,%%xmm0,%%xmm1 \n"
michael@0 420 "pshufb %%xmm3,%%xmm0 \n"
michael@0 421 "pshufb %%xmm4,%%xmm1 \n"
michael@0 422 "pshufb %%xmm5,%%xmm2 \n"
michael@0 423 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 424 "movq %%xmm1," MEMACCESS2(0x8,1) " \n"
michael@0 425 "movq %%xmm2," MEMACCESS2(0x10,1) " \n"
michael@0 426 "lea " MEMLEA(0x18,1) ",%1 \n"
michael@0 427 "sub $0x18,%2 \n"
michael@0 428 "jg 1b \n"
michael@0 429 : "+r"(src_ptr), // %0
michael@0 430 "+r"(dst_ptr), // %1
michael@0 431 "+r"(dst_width) // %2
michael@0 432 :
michael@0 433 : "memory", "cc"
michael@0 434 #if defined(__SSE2__)
michael@0 435 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
michael@0 436 #endif
michael@0 437 );
michael@0 438 }
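
// Illustrative plain-C sketch of the 3/4 point-sampling above (hypothetical
// helper, not part of the original file): kShuf0/kShuf1/kShuf2 together keep
// bytes 0, 1 and 3 of every group of four source pixels in the 32-byte block.
static void ScaleRowDown34_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  // Unused: this path point-samples a single row.
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}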
michael@0 439
michael@0 440 void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr,
michael@0 441 ptrdiff_t src_stride,
michael@0 442 uint8* dst_ptr, int dst_width) {
michael@0 443 asm volatile (
michael@0 444 "movdqa %0,%%xmm2 \n" // kShuf01
michael@0 445 "movdqa %1,%%xmm3 \n" // kShuf11
michael@0 446 "movdqa %2,%%xmm4 \n" // kShuf21
michael@0 447 :
michael@0 448 : "m"(kShuf01), // %0
michael@0 449 "m"(kShuf11), // %1
michael@0 450 "m"(kShuf21) // %2
michael@0 451 );
michael@0 452 asm volatile (
michael@0 453 "movdqa %0,%%xmm5 \n" // kMadd01
michael@0 454 "movdqa %1,%%xmm0 \n" // kMadd11
michael@0 455 "movdqa %2,%%xmm1 \n" // kRound34
michael@0 456 :
michael@0 457 : "m"(kMadd01), // %0
michael@0 458 "m"(kMadd11), // %1
michael@0 459 "m"(kRound34) // %2
michael@0 460 );
michael@0 461 asm volatile (
michael@0 462 LABELALIGN
michael@0 463 "1: \n"
michael@0 464 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
michael@0 465 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7
michael@0 466 "pavgb %%xmm7,%%xmm6 \n"
michael@0 467 "pshufb %%xmm2,%%xmm6 \n"
michael@0 468 "pmaddubsw %%xmm5,%%xmm6 \n"
michael@0 469 "paddsw %%xmm1,%%xmm6 \n"
michael@0 470 "psrlw $0x2,%%xmm6 \n"
michael@0 471 "packuswb %%xmm6,%%xmm6 \n"
michael@0 472 "movq %%xmm6," MEMACCESS(1) " \n"
michael@0 473 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
michael@0 474 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7
michael@0 475 "pavgb %%xmm7,%%xmm6 \n"
michael@0 476 "pshufb %%xmm3,%%xmm6 \n"
michael@0 477 "pmaddubsw %%xmm0,%%xmm6 \n"
michael@0 478 "paddsw %%xmm1,%%xmm6 \n"
michael@0 479 "psrlw $0x2,%%xmm6 \n"
michael@0 480 "packuswb %%xmm6,%%xmm6 \n"
michael@0 481 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
michael@0 482 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
michael@0 483 BUNDLEALIGN
michael@0 484 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7
michael@0 485 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 486 "pavgb %%xmm7,%%xmm6 \n"
michael@0 487 "pshufb %%xmm4,%%xmm6 \n"
michael@0 488 "pmaddubsw %4,%%xmm6 \n"
michael@0 489 "paddsw %%xmm1,%%xmm6 \n"
michael@0 490 "psrlw $0x2,%%xmm6 \n"
michael@0 491 "packuswb %%xmm6,%%xmm6 \n"
michael@0 492 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
michael@0 493 "lea " MEMLEA(0x18,1) ",%1 \n"
michael@0 494 "sub $0x18,%2 \n"
michael@0 495 "jg 1b \n"
michael@0 496 : "+r"(src_ptr), // %0
michael@0 497 "+r"(dst_ptr), // %1
michael@0 498 "+r"(dst_width) // %2
michael@0 499 : "r"((intptr_t)(src_stride)), // %3
michael@0 500 "m"(kMadd21) // %4
michael@0 501 : "memory", "cc"
michael@0 502 #if defined(__native_client__) && defined(__x86_64__)
michael@0 503 , "r14"
michael@0 504 #endif
michael@0 505 #if defined(__SSE2__)
michael@0 506 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 507 #endif
michael@0 508 );
michael@0 509 }
michael@0 510
michael@0 511 void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr,
michael@0 512 ptrdiff_t src_stride,
michael@0 513 uint8* dst_ptr, int dst_width) {
michael@0 514 asm volatile (
michael@0 515 "movdqa %0,%%xmm2 \n" // kShuf01
michael@0 516 "movdqa %1,%%xmm3 \n" // kShuf11
michael@0 517 "movdqa %2,%%xmm4 \n" // kShuf21
michael@0 518 :
michael@0 519 : "m"(kShuf01), // %0
michael@0 520 "m"(kShuf11), // %1
michael@0 521 "m"(kShuf21) // %2
michael@0 522 );
michael@0 523 asm volatile (
michael@0 524 "movdqa %0,%%xmm5 \n" // kMadd01
michael@0 525 "movdqa %1,%%xmm0 \n" // kMadd11
michael@0 526 "movdqa %2,%%xmm1 \n" // kRound34
michael@0 527 :
michael@0 528 : "m"(kMadd01), // %0
michael@0 529 "m"(kMadd11), // %1
michael@0 530 "m"(kRound34) // %2
michael@0 531 );
michael@0 532
michael@0 533 asm volatile (
michael@0 534 LABELALIGN
michael@0 535 "1: \n"
michael@0 536 "movdqa " MEMACCESS(0) ",%%xmm6 \n"
michael@0 537 MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7
michael@0 538 "pavgb %%xmm6,%%xmm7 \n"
michael@0 539 "pavgb %%xmm7,%%xmm6 \n"
michael@0 540 "pshufb %%xmm2,%%xmm6 \n"
michael@0 541 "pmaddubsw %%xmm5,%%xmm6 \n"
michael@0 542 "paddsw %%xmm1,%%xmm6 \n"
michael@0 543 "psrlw $0x2,%%xmm6 \n"
michael@0 544 "packuswb %%xmm6,%%xmm6 \n"
michael@0 545 "movq %%xmm6," MEMACCESS(1) " \n"
michael@0 546 "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n"
michael@0 547 MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7
michael@0 548 "pavgb %%xmm6,%%xmm7 \n"
michael@0 549 "pavgb %%xmm7,%%xmm6 \n"
michael@0 550 "pshufb %%xmm3,%%xmm6 \n"
michael@0 551 "pmaddubsw %%xmm0,%%xmm6 \n"
michael@0 552 "paddsw %%xmm1,%%xmm6 \n"
michael@0 553 "psrlw $0x2,%%xmm6 \n"
michael@0 554 "packuswb %%xmm6,%%xmm6 \n"
michael@0 555 "movq %%xmm6," MEMACCESS2(0x8,1) " \n"
michael@0 556 "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n"
michael@0 557 MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7
michael@0 558 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 559 "pavgb %%xmm6,%%xmm7 \n"
michael@0 560 "pavgb %%xmm7,%%xmm6 \n"
michael@0 561 "pshufb %%xmm4,%%xmm6 \n"
michael@0 562 "pmaddubsw %4,%%xmm6 \n"
michael@0 563 "paddsw %%xmm1,%%xmm6 \n"
michael@0 564 "psrlw $0x2,%%xmm6 \n"
michael@0 565 "packuswb %%xmm6,%%xmm6 \n"
michael@0 566 "movq %%xmm6," MEMACCESS2(0x10,1) " \n"
michael@0 567 "lea " MEMLEA(0x18,1) ",%1 \n"
michael@0 568 "sub $0x18,%2 \n"
michael@0 569 "jg 1b \n"
michael@0 570 : "+r"(src_ptr), // %0
michael@0 571 "+r"(dst_ptr), // %1
michael@0 572 "+r"(dst_width) // %2
michael@0 573 : "r"((intptr_t)(src_stride)), // %3
michael@0 574 "m"(kMadd21) // %4
michael@0 575 : "memory", "cc"
michael@0 576 #if defined(__native_client__) && defined(__x86_64__)
michael@0 577 , "r14"
michael@0 578 #endif
michael@0 579 #if defined(__SSE2__)
michael@0 580 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 581 #endif
michael@0 582 );
michael@0 583 }
michael@0 584
michael@0 585 void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 586 uint8* dst_ptr, int dst_width) {
michael@0 587 asm volatile (
michael@0 588 "movdqa %3,%%xmm4 \n"
michael@0 589 "movdqa %4,%%xmm5 \n"
michael@0 590
michael@0 591 LABELALIGN
michael@0 592 "1: \n"
michael@0 593 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 594 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 595 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 596 "pshufb %%xmm4,%%xmm0 \n"
michael@0 597 "pshufb %%xmm5,%%xmm1 \n"
michael@0 598 "paddusb %%xmm1,%%xmm0 \n"
michael@0 599 "movq %%xmm0," MEMACCESS(1) " \n"
michael@0 600 "movhlps %%xmm0,%%xmm1 \n"
michael@0 601 "movd %%xmm1," MEMACCESS2(0x8,1) " \n"
michael@0 602 "lea " MEMLEA(0xc,1) ",%1 \n"
michael@0 603 "sub $0xc,%2 \n"
michael@0 604 "jg 1b \n"
michael@0 605 : "+r"(src_ptr), // %0
michael@0 606 "+r"(dst_ptr), // %1
michael@0 607 "+r"(dst_width) // %2
michael@0 608 : "m"(kShuf38a), // %3
michael@0 609 "m"(kShuf38b) // %4
michael@0 610 : "memory", "cc"
michael@0 611 #if defined(__SSE2__)
michael@0 612 , "xmm0", "xmm1", "xmm4", "xmm5"
michael@0 613 #endif
michael@0 614 );
michael@0 615 }
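
// Illustrative plain-C sketch of the 3/8 point-sampling above (hypothetical
// helper, not part of the original file): kShuf38a/kShuf38b keep bytes 0, 3
// and 6 of every group of eight source pixels.
static void ScaleRowDown38_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;  // Unused: this path point-samples a single row.
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr += 3;
    src_ptr += 8;
  }
}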
michael@0 616
michael@0 617 void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr,
michael@0 618 ptrdiff_t src_stride,
michael@0 619 uint8* dst_ptr, int dst_width) {
michael@0 620 asm volatile (
michael@0 621 "movdqa %0,%%xmm2 \n"
michael@0 622 "movdqa %1,%%xmm3 \n"
michael@0 623 "movdqa %2,%%xmm4 \n"
michael@0 624 "movdqa %3,%%xmm5 \n"
michael@0 625 :
michael@0 626 : "m"(kShufAb0), // %0
michael@0 627 "m"(kShufAb1), // %1
michael@0 628 "m"(kShufAb2), // %2
michael@0 629 "m"(kScaleAb2) // %3
michael@0 630 );
michael@0 631 asm volatile (
michael@0 632 LABELALIGN
michael@0 633 "1: \n"
michael@0 634 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 635 MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0
michael@0 636 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 637 "movdqa %%xmm0,%%xmm1 \n"
michael@0 638 "pshufb %%xmm2,%%xmm1 \n"
michael@0 639 "movdqa %%xmm0,%%xmm6 \n"
michael@0 640 "pshufb %%xmm3,%%xmm6 \n"
michael@0 641 "paddusw %%xmm6,%%xmm1 \n"
michael@0 642 "pshufb %%xmm4,%%xmm0 \n"
michael@0 643 "paddusw %%xmm0,%%xmm1 \n"
michael@0 644 "pmulhuw %%xmm5,%%xmm1 \n"
michael@0 645 "packuswb %%xmm1,%%xmm1 \n"
michael@0 646 "sub $0x6,%2 \n"
michael@0 647 "movd %%xmm1," MEMACCESS(1) " \n"
michael@0 648 "psrlq $0x10,%%xmm1 \n"
michael@0 649 "movd %%xmm1," MEMACCESS2(0x2,1) " \n"
michael@0 650 "lea " MEMLEA(0x6,1) ",%1 \n"
michael@0 651 "jg 1b \n"
michael@0 652 : "+r"(src_ptr), // %0
michael@0 653 "+r"(dst_ptr), // %1
michael@0 654 "+r"(dst_width) // %2
michael@0 655 : "r"((intptr_t)(src_stride)) // %3
michael@0 656 : "memory", "cc"
michael@0 657 #if defined(__native_client__) && defined(__x86_64__)
michael@0 658 , "r14"
michael@0 659 #endif
michael@0 660 #if defined(__SSE2__)
michael@0 661 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 662 #endif
michael@0 663 );
michael@0 664 }
michael@0 665
michael@0 666 void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr,
michael@0 667 ptrdiff_t src_stride,
michael@0 668 uint8* dst_ptr, int dst_width) {
michael@0 669 asm volatile (
michael@0 670 "movdqa %0,%%xmm2 \n"
michael@0 671 "movdqa %1,%%xmm3 \n"
michael@0 672 "movdqa %2,%%xmm4 \n"
michael@0 673 "pxor %%xmm5,%%xmm5 \n"
michael@0 674 :
michael@0 675 : "m"(kShufAc), // %0
michael@0 676 "m"(kShufAc3), // %1
michael@0 677 "m"(kScaleAc33) // %2
michael@0 678 );
michael@0 679 asm volatile (
michael@0 680 LABELALIGN
michael@0 681 "1: \n"
michael@0 682 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 683 MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6
michael@0 684 "movhlps %%xmm0,%%xmm1 \n"
michael@0 685 "movhlps %%xmm6,%%xmm7 \n"
michael@0 686 "punpcklbw %%xmm5,%%xmm0 \n"
michael@0 687 "punpcklbw %%xmm5,%%xmm1 \n"
michael@0 688 "punpcklbw %%xmm5,%%xmm6 \n"
michael@0 689 "punpcklbw %%xmm5,%%xmm7 \n"
michael@0 690 "paddusw %%xmm6,%%xmm0 \n"
michael@0 691 "paddusw %%xmm7,%%xmm1 \n"
michael@0 692 MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6
michael@0 693 "lea " MEMLEA(0x10,0) ",%0 \n"
michael@0 694 "movhlps %%xmm6,%%xmm7 \n"
michael@0 695 "punpcklbw %%xmm5,%%xmm6 \n"
michael@0 696 "punpcklbw %%xmm5,%%xmm7 \n"
michael@0 697 "paddusw %%xmm6,%%xmm0 \n"
michael@0 698 "paddusw %%xmm7,%%xmm1 \n"
michael@0 699 "movdqa %%xmm0,%%xmm6 \n"
michael@0 700 "psrldq $0x2,%%xmm0 \n"
michael@0 701 "paddusw %%xmm0,%%xmm6 \n"
michael@0 702 "psrldq $0x2,%%xmm0 \n"
michael@0 703 "paddusw %%xmm0,%%xmm6 \n"
michael@0 704 "pshufb %%xmm2,%%xmm6 \n"
michael@0 705 "movdqa %%xmm1,%%xmm7 \n"
michael@0 706 "psrldq $0x2,%%xmm1 \n"
michael@0 707 "paddusw %%xmm1,%%xmm7 \n"
michael@0 708 "psrldq $0x2,%%xmm1 \n"
michael@0 709 "paddusw %%xmm1,%%xmm7 \n"
michael@0 710 "pshufb %%xmm3,%%xmm7 \n"
michael@0 711 "paddusw %%xmm7,%%xmm6 \n"
michael@0 712 "pmulhuw %%xmm4,%%xmm6 \n"
michael@0 713 "packuswb %%xmm6,%%xmm6 \n"
michael@0 714 "sub $0x6,%2 \n"
michael@0 715 "movd %%xmm6," MEMACCESS(1) " \n"
michael@0 716 "psrlq $0x10,%%xmm6 \n"
michael@0 717 "movd %%xmm6," MEMACCESS2(0x2,1) " \n"
michael@0 718 "lea " MEMLEA(0x6,1) ",%1 \n"
michael@0 719 "jg 1b \n"
michael@0 720 : "+r"(src_ptr), // %0
michael@0 721 "+r"(dst_ptr), // %1
michael@0 722 "+r"(dst_width) // %2
michael@0 723 : "r"((intptr_t)(src_stride)) // %3
michael@0 724 : "memory", "cc"
michael@0 725 #if defined(__native_client__) && defined(__x86_64__)
michael@0 726 , "r14"
michael@0 727 #endif
michael@0 728 #if defined(__SSE2__)
michael@0 729 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
michael@0 730 #endif
michael@0 731 );
michael@0 732 }
michael@0 733
michael@0 734 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
michael@0 735 uint16* dst_ptr, int src_width, int src_height) {
michael@0 736 int tmp_height = 0;
michael@0 737 intptr_t tmp_src = 0;
michael@0 738 asm volatile (
michael@0 739 "pxor %%xmm4,%%xmm4 \n"
michael@0 740 "sub $0x1,%5 \n"
michael@0 741
michael@0 742 LABELALIGN
michael@0 743 "1: \n"
michael@0 744 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 745 "mov %0,%3 \n"
michael@0 746 "add %6,%0 \n"
michael@0 747 "movdqa %%xmm0,%%xmm1 \n"
michael@0 748 "punpcklbw %%xmm4,%%xmm0 \n"
michael@0 749 "punpckhbw %%xmm4,%%xmm1 \n"
michael@0 750 "mov %5,%2 \n"
michael@0 751 "test %2,%2 \n"
michael@0 752 "je 3f \n"
michael@0 753
michael@0 754 LABELALIGN
michael@0 755 "2: \n"
michael@0 756 "movdqa " MEMACCESS(0) ",%%xmm2 \n"
michael@0 757 "add %6,%0 \n"
michael@0 758 "movdqa %%xmm2,%%xmm3 \n"
michael@0 759 "punpcklbw %%xmm4,%%xmm2 \n"
michael@0 760 "punpckhbw %%xmm4,%%xmm3 \n"
michael@0 761 "paddusw %%xmm2,%%xmm0 \n"
michael@0 762 "paddusw %%xmm3,%%xmm1 \n"
michael@0 763 "sub $0x1,%2 \n"
michael@0 764 "jg 2b \n"
michael@0 765
michael@0 766 LABELALIGN
michael@0 767 "3: \n"
michael@0 768 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 769 "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n"
michael@0 770 "lea " MEMLEA(0x10,3) ",%0 \n"
michael@0 771 "lea " MEMLEA(0x20,1) ",%1 \n"
michael@0 772 "sub $0x10,%4 \n"
michael@0 773 "jg 1b \n"
michael@0 774 : "+r"(src_ptr), // %0
michael@0 775 "+r"(dst_ptr), // %1
michael@0 776 "+r"(tmp_height), // %2
michael@0 777 "+r"(tmp_src), // %3
michael@0 778 "+r"(src_width), // %4
michael@0 779 "+rm"(src_height) // %5
michael@0 780 : "rm"((intptr_t)(src_stride)) // %6
michael@0 781 : "memory", "cc"
michael@0 782 #if defined(__SSE2__)
michael@0 783 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
michael@0 784 #endif
michael@0 785 );
michael@0 786 }
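
// Illustrative plain-C sketch of ScaleAddRows_SSE2 above (hypothetical
// helper, not part of the original file): each 16-bit output is the sum of
// one source column over src_height rows; the SSE2 code saturates the sum
// via paddusw.
static void ScaleAddRows_C_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                  uint16* dst_ptr, int src_width,
                                  int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    unsigned int sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    // Clamp to 16 bits to mirror the saturating adds.
    dst_ptr[x] = (uint16)(sum > 65535u ? 65535u : sum);
  }
}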
michael@0 787
michael@0 788 // Bilinear column filtering. SSSE3 version.
michael@0 789 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
michael@0 790 int dst_width, int x, int dx) {
michael@0 791 intptr_t x0 = 0, x1 = 0, temp_pixel = 0;
michael@0 792 asm volatile (
michael@0 793 "movd %6,%%xmm2 \n"
michael@0 794 "movd %7,%%xmm3 \n"
michael@0 795 "movl $0x04040000,%k2 \n"
michael@0 796 "movd %k2,%%xmm5 \n"
michael@0 797 "pcmpeqb %%xmm6,%%xmm6 \n"
michael@0 798 "psrlw $0x9,%%xmm6 \n"
michael@0 799 "pextrw $0x1,%%xmm2,%k3 \n"
michael@0 800 "subl $0x2,%5 \n"
michael@0 801 "jl 29f \n"
michael@0 802 "movdqa %%xmm2,%%xmm0 \n"
michael@0 803 "paddd %%xmm3,%%xmm0 \n"
michael@0 804 "punpckldq %%xmm0,%%xmm2 \n"
michael@0 805 "punpckldq %%xmm3,%%xmm3 \n"
michael@0 806 "paddd %%xmm3,%%xmm3 \n"
michael@0 807 "pextrw $0x3,%%xmm2,%k4 \n"
michael@0 808
michael@0 809 LABELALIGN
michael@0 810 "2: \n"
michael@0 811 "movdqa %%xmm2,%%xmm1 \n"
michael@0 812 "paddd %%xmm3,%%xmm2 \n"
michael@0 813 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
michael@0 814 "movd %k2,%%xmm0 \n"
michael@0 815 "psrlw $0x9,%%xmm1 \n"
michael@0 816 BUNDLEALIGN
michael@0 817 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2
michael@0 818 "movd %k2,%%xmm4 \n"
michael@0 819 "pshufb %%xmm5,%%xmm1 \n"
michael@0 820 "punpcklwd %%xmm4,%%xmm0 \n"
michael@0 821 "pxor %%xmm6,%%xmm1 \n"
michael@0 822 "pmaddubsw %%xmm1,%%xmm0 \n"
michael@0 823 "pextrw $0x1,%%xmm2,%k3 \n"
michael@0 824 "pextrw $0x3,%%xmm2,%k4 \n"
michael@0 825 "psrlw $0x7,%%xmm0 \n"
michael@0 826 "packuswb %%xmm0,%%xmm0 \n"
michael@0 827 "movd %%xmm0,%k2 \n"
michael@0 828 "mov %w2," MEMACCESS(0) " \n"
michael@0 829 "lea " MEMLEA(0x2,0) ",%0 \n"
michael@0 830 "sub $0x2,%5 \n"
michael@0 831 "jge 2b \n"
michael@0 832
michael@0 833 LABELALIGN
michael@0 834 "29: \n"
michael@0 835 "addl $0x1,%5 \n"
michael@0 836 "jl 99f \n"
michael@0 837 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2
michael@0 838 "movd %k2,%%xmm0 \n"
michael@0 839 "psrlw $0x9,%%xmm2 \n"
michael@0 840 "pshufb %%xmm5,%%xmm2 \n"
michael@0 841 "pxor %%xmm6,%%xmm2 \n"
michael@0 842 "pmaddubsw %%xmm2,%%xmm0 \n"
michael@0 843 "psrlw $0x7,%%xmm0 \n"
michael@0 844 "packuswb %%xmm0,%%xmm0 \n"
michael@0 845 "movd %%xmm0,%k2 \n"
michael@0 846 "mov %b2," MEMACCESS(0) " \n"
michael@0 847 "99: \n"
michael@0 848 : "+r"(dst_ptr), // %0
michael@0 849 "+r"(src_ptr), // %1
michael@0 850 "+a"(temp_pixel), // %2
michael@0 851 "+r"(x0), // %3
michael@0 852 "+r"(x1), // %4
michael@0 853 "+rm"(dst_width) // %5
michael@0 854 : "rm"(x), // %6
michael@0 855 "rm"(dx) // %7
michael@0 856 : "memory", "cc"
michael@0 857 #if defined(__native_client__) && defined(__x86_64__)
michael@0 858 , "r14"
michael@0 859 #endif
michael@0 860 #if defined(__SSE2__)
michael@0 861 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 862 #endif
michael@0 863 );
michael@0 864 }
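
// Illustrative plain-C sketch of the bilinear column filter above
// (hypothetical helper, not part of the original file). x and dx are 16.16
// fixed point; the SSSE3 path keeps only the top 7 bits of the fraction, so
// its rounding differs slightly from this full-precision version.
static void ScaleFilterCols_C_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                     int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;     // Integer source position.
    int f = x & 0xffff;   // 16-bit fraction between src[xi] and src[xi + 1].
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)(a + ((f * (b - a)) >> 16));
    x += dx;
  }
}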
michael@0 865
michael@0 866 // Reads 16 pixels, duplicates them and writes 32 pixels.
michael@0 867 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
michael@0 868 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr,
michael@0 869 int dst_width, int x, int dx) {
michael@0 870 asm volatile (
michael@0 871 LABELALIGN
michael@0 872 "1: \n"
michael@0 873 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 874 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 875 "movdqa %%xmm0,%%xmm1 \n"
michael@0 876 "punpcklbw %%xmm0,%%xmm0 \n"
michael@0 877 "punpckhbw %%xmm1,%%xmm1 \n"
michael@0 878 "sub $0x20,%2 \n"
michael@0 879 "movdqa %%xmm0," MEMACCESS(0) " \n"
michael@0 880 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
michael@0 881 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 882 "jg 1b \n"
michael@0 883
michael@0 884 : "+r"(dst_ptr), // %0
michael@0 885 "+r"(src_ptr), // %1
michael@0 886 "+r"(dst_width) // %2
michael@0 887 :
michael@0 888 : "memory", "cc"
michael@0 889 #if defined(__SSE2__)
michael@0 890 , "xmm0", "xmm1"
michael@0 891 #endif
michael@0 892 );
michael@0 893 }
michael@0 894
michael@0 895 void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
michael@0 896 ptrdiff_t src_stride,
michael@0 897 uint8* dst_argb, int dst_width) {
michael@0 898 asm volatile (
michael@0 899 LABELALIGN
michael@0 900 "1: \n"
michael@0 901 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 902 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 903 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 904 "shufps $0xdd,%%xmm1,%%xmm0 \n"
michael@0 905 "sub $0x4,%2 \n"
michael@0 906 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 907 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 908 "jg 1b \n"
michael@0 909 : "+r"(src_argb), // %0
michael@0 910 "+r"(dst_argb), // %1
michael@0 911 "+r"(dst_width) // %2
michael@0 912 :
michael@0 913 : "memory", "cc"
michael@0 914 #if defined(__SSE2__)
michael@0 915 , "xmm0", "xmm1"
michael@0 916 #endif
michael@0 917 );
michael@0 918 }
michael@0 919
michael@0 920 void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb,
michael@0 921 ptrdiff_t src_stride,
michael@0 922 uint8* dst_argb, int dst_width) {
michael@0 923 asm volatile (
michael@0 924 LABELALIGN
michael@0 925 "1: \n"
michael@0 926 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 927 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 928 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 929 "movdqa %%xmm0,%%xmm2 \n"
michael@0 930 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 931 "shufps $0xdd,%%xmm1,%%xmm2 \n"
michael@0 932 "pavgb %%xmm2,%%xmm0 \n"
michael@0 933 "sub $0x4,%2 \n"
michael@0 934 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 935 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 936 "jg 1b \n"
michael@0 937 : "+r"(src_argb), // %0
michael@0 938 "+r"(dst_argb), // %1
michael@0 939 "+r"(dst_width) // %2
michael@0 940 :
michael@0 941 : "memory", "cc"
michael@0 942 #if defined(__SSE2__)
michael@0 943 , "xmm0", "xmm1"
michael@0 944 #endif
michael@0 945 );
michael@0 946 }
michael@0 947
michael@0 948 void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
michael@0 949 ptrdiff_t src_stride,
michael@0 950 uint8* dst_argb, int dst_width) {
michael@0 951 asm volatile (
michael@0 952 LABELALIGN
michael@0 953 "1: \n"
michael@0 954 "movdqa " MEMACCESS(0) ",%%xmm0 \n"
michael@0 955 "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n"
michael@0 956 BUNDLEALIGN
michael@0 957 MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2
michael@0 958 MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3
michael@0 959 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 960 "pavgb %%xmm2,%%xmm0 \n"
michael@0 961 "pavgb %%xmm3,%%xmm1 \n"
michael@0 962 "movdqa %%xmm0,%%xmm2 \n"
michael@0 963 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 964 "shufps $0xdd,%%xmm1,%%xmm2 \n"
michael@0 965 "pavgb %%xmm2,%%xmm0 \n"
michael@0 966 "sub $0x4,%2 \n"
michael@0 967 "movdqa %%xmm0," MEMACCESS(1) " \n"
michael@0 968 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 969 "jg 1b \n"
michael@0 970 : "+r"(src_argb), // %0
michael@0 971 "+r"(dst_argb), // %1
michael@0 972 "+r"(dst_width) // %2
michael@0 973 : "r"((intptr_t)(src_stride)) // %3
michael@0 974 : "memory", "cc"
michael@0 975 #if defined(__native_client__) && defined(__x86_64__)
michael@0 976 , "r14"
michael@0 977 #endif
michael@0 978 #if defined(__SSE2__)
michael@0 979 , "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 980 #endif
michael@0 981 );
michael@0 982 }
michael@0 983
michael@0 984 // Reads 4 pixels at a time.
michael@0 985 // Alignment requirement: dst_argb 16 byte aligned.
michael@0 986 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,
michael@0 987 int src_stepx,
michael@0 988 uint8* dst_argb, int dst_width) {
michael@0 989 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
michael@0 990 intptr_t src_stepx_x12 = 0;
michael@0 991 asm volatile (
michael@0 992 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
michael@0 993 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
michael@0 994 LABELALIGN
michael@0 995 "1: \n"
michael@0 996 "movd " MEMACCESS(0) ",%%xmm0 \n"
michael@0 997 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1
michael@0 998 "punpckldq %%xmm1,%%xmm0 \n"
michael@0 999 BUNDLEALIGN
michael@0 1000 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2
michael@0 1001 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3
michael@0 1002 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
michael@0 1003 "punpckldq %%xmm3,%%xmm2 \n"
michael@0 1004 "punpcklqdq %%xmm2,%%xmm0 \n"
michael@0 1005 "sub $0x4,%3 \n"
michael@0 1006 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 1007 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 1008 "jg 1b \n"
michael@0 1009 : "+r"(src_argb), // %0
michael@0 1010 "+r"(src_stepx_x4), // %1
michael@0 1011 "+r"(dst_argb), // %2
michael@0 1012 "+r"(dst_width), // %3
michael@0 1013 "+r"(src_stepx_x12) // %4
michael@0 1014 :
michael@0 1015 : "memory", "cc"
michael@0 1016 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1017 , "r14"
michael@0 1018 #endif
michael@0 1019 #if defined(__SSE2__)
michael@0 1020 , "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 1021 #endif
michael@0 1022 );
michael@0 1023 }
michael@0 1024
michael@0 1025 // Blends four 2x2 to 4x1.
michael@0 1026 // Alignment requirement: dst_argb 16 byte aligned.
michael@0 1027 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
michael@0 1028 ptrdiff_t src_stride, int src_stepx,
michael@0 1029 uint8* dst_argb, int dst_width) {
michael@0 1030 intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
michael@0 1031 intptr_t src_stepx_x12 = 0;
michael@0 1032 intptr_t row1 = (intptr_t)(src_stride);
michael@0 1033 asm volatile (
michael@0 1034 "lea " MEMLEA3(0x00,1,4) ",%1 \n"
michael@0 1035 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"
michael@0 1036 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"
michael@0 1037
michael@0 1038 LABELALIGN
michael@0 1039 "1: \n"
michael@0 1040 "movq " MEMACCESS(0) ",%%xmm0 \n"
michael@0 1041 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0
michael@0 1042 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1
michael@0 1043 BUNDLEALIGN
michael@0 1044 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1
michael@0 1045 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"
michael@0 1046 "movq " MEMACCESS(5) ",%%xmm2 \n"
michael@0 1047 BUNDLEALIGN
michael@0 1048 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2
michael@0 1049 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3
michael@0 1050 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3
michael@0 1051 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"
michael@0 1052 "pavgb %%xmm2,%%xmm0 \n"
michael@0 1053 "pavgb %%xmm3,%%xmm1 \n"
michael@0 1054 "movdqa %%xmm0,%%xmm2 \n"
michael@0 1055 "shufps $0x88,%%xmm1,%%xmm0 \n"
michael@0 1056 "shufps $0xdd,%%xmm1,%%xmm2 \n"
michael@0 1057 "pavgb %%xmm2,%%xmm0 \n"
michael@0 1058 "sub $0x4,%3 \n"
michael@0 1059 "movdqa %%xmm0," MEMACCESS(2) " \n"
michael@0 1060 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 1061 "jg 1b \n"
michael@0 1062 : "+r"(src_argb), // %0
michael@0 1063 "+r"(src_stepx_x4), // %1
michael@0 1064 "+r"(dst_argb), // %2
michael@0 1065 "+rm"(dst_width), // %3
michael@0 1066 "+r"(src_stepx_x12), // %4
michael@0 1067 "+r"(row1) // %5
michael@0 1068 :
michael@0 1069 : "memory", "cc"
michael@0 1070 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1071 , "r14"
michael@0 1072 #endif
michael@0 1073 #if defined(__SSE2__)
michael@0 1074 , "xmm0", "xmm1", "xmm2", "xmm3"
michael@0 1075 #endif
michael@0 1076 );
michael@0 1077 }
michael@0 1078
michael@0 1079 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
michael@0 1080 int dst_width, int x, int dx) {
michael@0 1081 intptr_t x0 = 0, x1 = 0;
michael@0 1082 asm volatile (
michael@0 1083 "movd %5,%%xmm2 \n"
michael@0 1084 "movd %6,%%xmm3 \n"
michael@0 1085 "pshufd $0x0,%%xmm2,%%xmm2 \n"
michael@0 1086 "pshufd $0x11,%%xmm3,%%xmm0 \n"
michael@0 1087 "paddd %%xmm0,%%xmm2 \n"
michael@0 1088 "paddd %%xmm3,%%xmm3 \n"
michael@0 1089 "pshufd $0x5,%%xmm3,%%xmm0 \n"
michael@0 1090 "paddd %%xmm0,%%xmm2 \n"
michael@0 1091 "paddd %%xmm3,%%xmm3 \n"
michael@0 1092 "pshufd $0x0,%%xmm3,%%xmm3 \n"
michael@0 1093 "pextrw $0x1,%%xmm2,%k0 \n"
michael@0 1094 "pextrw $0x3,%%xmm2,%k1 \n"
michael@0 1095 "cmp $0x0,%4 \n"
michael@0 1096 "jl 99f \n"
michael@0 1097 "sub $0x4,%4 \n"
michael@0 1098 "jl 49f \n"
michael@0 1099
michael@0 1100 LABELALIGN
michael@0 1101 "40: \n"
michael@0 1102 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
michael@0 1103 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
michael@0 1104 "pextrw $0x5,%%xmm2,%k0 \n"
michael@0 1105 "pextrw $0x7,%%xmm2,%k1 \n"
michael@0 1106 "paddd %%xmm3,%%xmm2 \n"
michael@0 1107 "punpckldq %%xmm1,%%xmm0 \n"
michael@0 1108 MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1
michael@0 1109 MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4
michael@0 1110 "pextrw $0x1,%%xmm2,%k0 \n"
michael@0 1111 "pextrw $0x3,%%xmm2,%k1 \n"
michael@0 1112 "punpckldq %%xmm4,%%xmm1 \n"
michael@0 1113 "punpcklqdq %%xmm1,%%xmm0 \n"
michael@0 1114 "sub $0x4,%4 \n"
michael@0 1115 "movdqu %%xmm0," MEMACCESS(2) " \n"
michael@0 1116 "lea " MEMLEA(0x10,2) ",%2 \n"
michael@0 1117 "jge 40b \n"
michael@0 1118
michael@0 1119 "49: \n"
michael@0 1120 "test $0x2,%4 \n"
michael@0 1121 "je 29f \n"
michael@0 1122 BUNDLEALIGN
michael@0 1123 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
michael@0 1124 MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1
michael@0 1125 "pextrw $0x5,%%xmm2,%k0 \n"
michael@0 1126 "punpckldq %%xmm1,%%xmm0 \n"
michael@0 1127 "movq %%xmm0," MEMACCESS(2) " \n"
michael@0 1128 "lea " MEMLEA(0x8,2) ",%2 \n"
michael@0 1129 "29: \n"
michael@0 1130 "test $0x1,%4 \n"
michael@0 1131 "je 99f \n"
michael@0 1132 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0
michael@0 1133 "movd %%xmm0," MEMACCESS(2) " \n"
michael@0 1134 "99: \n"
michael@0 1135 : "+a"(x0), // %0
michael@0 1136 "+d"(x1), // %1
michael@0 1137 "+r"(dst_argb), // %2
michael@0 1138 "+r"(src_argb), // %3
michael@0 1139 "+r"(dst_width) // %4
michael@0 1140 : "rm"(x), // %5
michael@0 1141 "rm"(dx) // %6
michael@0 1142 : "memory", "cc"
michael@0 1143 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1144 , "r14"
michael@0 1145 #endif
michael@0 1146 #if defined(__SSE2__)
michael@0 1147 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
michael@0 1148 #endif
michael@0 1149 );
michael@0 1150 }
michael@0 1151
michael@0 1152 // Reads 4 pixels, duplicates them and writes 8 pixels.
michael@0 1153 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
michael@0 1154 void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
michael@0 1155 int dst_width, int x, int dx) {
michael@0 1156 asm volatile (
michael@0 1157 LABELALIGN
michael@0 1158 "1: \n"
michael@0 1159 "movdqa " MEMACCESS(1) ",%%xmm0 \n"
michael@0 1160 "lea " MEMLEA(0x10,1) ",%1 \n"
michael@0 1161 "movdqa %%xmm0,%%xmm1 \n"
michael@0 1162 "punpckldq %%xmm0,%%xmm0 \n"
michael@0 1163 "punpckhdq %%xmm1,%%xmm1 \n"
michael@0 1164 "sub $0x8,%2 \n"
michael@0 1165 "movdqa %%xmm0," MEMACCESS(0) " \n"
michael@0 1166 "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
michael@0 1167 "lea " MEMLEA(0x20,0) ",%0 \n"
michael@0 1168 "jg 1b \n"
michael@0 1169
michael@0 1170 : "+r"(dst_argb), // %0
michael@0 1171 "+r"(src_argb), // %1
michael@0 1172 "+r"(dst_width) // %2
michael@0 1173 :
michael@0 1174 : "memory", "cc"
michael@0 1175 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1176 , "r14"
michael@0 1177 #endif
michael@0 1178 #if defined(__SSE2__)
michael@0 1179 , "xmm0", "xmm1"
michael@0 1180 #endif
michael@0 1181 );
michael@0 1182 }
michael@0 1183
michael@0 1184 // Shuffle table for arranging 2 pixels into pairs for pmaddubsw
michael@0 1185 static uvec8 kShuffleColARGB = {
michael@0 1186 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel
michael@0 1187 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel
michael@0 1188 };
michael@0 1189
michael@0 1190 // Shuffle table for duplicating 2 fractions into 8 bytes each
michael@0 1191 static uvec8 kShuffleFractions = {
michael@0 1192 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
michael@0 1193 };
michael@0 1194
michael@0 1195 // Bilinear column filtering for ARGB pixels. SSSE3 version
michael@0 1196 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
michael@0 1197 int dst_width, int x, int dx) {
michael@0 1198 intptr_t x0 = 0, x1 = 0;
michael@0 1199 asm volatile (
michael@0 1200 "movdqa %0,%%xmm4 \n"
michael@0 1201 "movdqa %1,%%xmm5 \n"
michael@0 1202 :
michael@0 1203 : "m"(kShuffleColARGB), // %0
michael@0 1204 "m"(kShuffleFractions) // %1
michael@0 1205 );
michael@0 1206
michael@0 1207 asm volatile (
michael@0 1208 "movd %5,%%xmm2 \n"
michael@0 1209 "movd %6,%%xmm3 \n"
michael@0 1210 "pcmpeqb %%xmm6,%%xmm6 \n"
michael@0 1211 "psrlw $0x9,%%xmm6 \n"
michael@0 1212 "pextrw $0x1,%%xmm2,%k3 \n"
michael@0 1213 "sub $0x2,%2 \n"
michael@0 1214 "jl 29f \n"
michael@0 1215 "movdqa %%xmm2,%%xmm0 \n"
michael@0 1216 "paddd %%xmm3,%%xmm0 \n"
michael@0 1217 "punpckldq %%xmm0,%%xmm2 \n"
michael@0 1218 "punpckldq %%xmm3,%%xmm3 \n"
michael@0 1219 "paddd %%xmm3,%%xmm3 \n"
michael@0 1220 "pextrw $0x3,%%xmm2,%k4 \n"
michael@0 1221
michael@0 1222 LABELALIGN
michael@0 1223 "2: \n"
michael@0 1224 "movdqa %%xmm2,%%xmm1 \n"
michael@0 1225 "paddd %%xmm3,%%xmm2 \n"
michael@0 1226 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
michael@0 1227 "psrlw $0x9,%%xmm1 \n"
michael@0 1228 BUNDLEALIGN
michael@0 1229 MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0
michael@0 1230 "pshufb %%xmm5,%%xmm1 \n"
michael@0 1231 "pshufb %%xmm4,%%xmm0 \n"
michael@0 1232 "pxor %%xmm6,%%xmm1 \n"
michael@0 1233 "pmaddubsw %%xmm1,%%xmm0 \n"
michael@0 1234 "psrlw $0x7,%%xmm0 \n"
michael@0 1235 "pextrw $0x1,%%xmm2,%k3 \n"
michael@0 1236 "pextrw $0x3,%%xmm2,%k4 \n"
michael@0 1237 "packuswb %%xmm0,%%xmm0 \n"
michael@0 1238 "movq %%xmm0," MEMACCESS(0) " \n"
michael@0 1239 "lea " MEMLEA(0x8,0) ",%0 \n"
michael@0 1240 "sub $0x2,%2 \n"
michael@0 1241 "jge 2b \n"
michael@0 1242
michael@0 1243 LABELALIGN
michael@0 1244 "29: \n"
michael@0 1245 "add $0x1,%2 \n"
michael@0 1246 "jl 99f \n"
michael@0 1247 "psrlw $0x9,%%xmm2 \n"
michael@0 1248 BUNDLEALIGN
michael@0 1249 MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0
michael@0 1250 "pshufb %%xmm5,%%xmm2 \n"
michael@0 1251 "pshufb %%xmm4,%%xmm0 \n"
michael@0 1252 "pxor %%xmm6,%%xmm2 \n"
michael@0 1253 "pmaddubsw %%xmm2,%%xmm0 \n"
michael@0 1254 "psrlw $0x7,%%xmm0 \n"
michael@0 1255 "packuswb %%xmm0,%%xmm0 \n"
michael@0 1256 "movd %%xmm0," MEMACCESS(0) " \n"
michael@0 1257
michael@0 1258 LABELALIGN
michael@0 1259 "99: \n"
michael@0 1260 : "+r"(dst_argb), // %0
michael@0 1261 "+r"(src_argb), // %1
michael@0 1262 "+rm"(dst_width), // %2
michael@0 1263 "+r"(x0), // %3
michael@0 1264 "+r"(x1) // %4
michael@0 1265 : "rm"(x), // %5
michael@0 1266 "rm"(dx) // %6
michael@0 1267 : "memory", "cc"
michael@0 1268 #if defined(__native_client__) && defined(__x86_64__)
michael@0 1269 , "r14"
michael@0 1270 #endif
michael@0 1271 #if defined(__SSE2__)
michael@0 1272 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
michael@0 1273 #endif
michael@0 1274 );
michael@0 1275 }
michael@0 1276
michael@0 1277 // Divide num by div and return as 16.16 fixed point result.
michael@0 1278 int FixedDiv_X86(int num, int div) {
michael@0 1279 asm volatile (
michael@0 1280 "cdq \n"
michael@0 1281 "shld $0x10,%%eax,%%edx \n"
michael@0 1282 "shl $0x10,%%eax \n"
michael@0 1283 "idiv %1 \n"
michael@0 1284 "mov %0, %%eax \n"
michael@0 1285 : "+a"(num) // %0
michael@0 1286 : "c"(div) // %1
michael@0 1287 : "memory", "cc", "edx"
michael@0 1288 );
michael@0 1289 return num;
michael@0 1290 }
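
// Illustrative plain-C equivalent of FixedDiv_X86 above (hypothetical helper,
// not part of the original file): widen to 64 bits, shift into 16.16 fixed
// point, then divide.
static int FixedDiv_C_Sketch(int num, int div) {
  return (int)((((long long)(num)) << 16) / div);
}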
michael@0 1291
michael@0 1292 // Divide num - 1 by div - 1 and return as 16.16 fixed point result.
michael@0 1293 int FixedDiv1_X86(int num, int div) {
michael@0 1294 asm volatile (
michael@0 1295 "cdq \n"
michael@0 1296 "shld $0x10,%%eax,%%edx \n"
michael@0 1297 "shl $0x10,%%eax \n"
michael@0 1298 "sub $0x10001,%%eax \n"
michael@0 1299 "sbb $0x0,%%edx \n"
michael@0 1300 "sub $0x1,%1 \n"
michael@0 1301 "idiv %1 \n"
michael@0 1302 "mov %0, %%eax \n"
michael@0 1303 : "+a"(num) // %0
michael@0 1304 : "c"(div) // %1
michael@0 1305 : "memory", "cc", "edx"
michael@0 1306 );
michael@0 1307 return num;
michael@0 1308 }
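
// Illustrative plain-C equivalent of FixedDiv1_X86 above (hypothetical
// helper, not part of the original file): the assembly computes
// ((num << 16) - 0x10001) / (div - 1) in 64-bit arithmetic.
static int FixedDiv1_C_Sketch(int num, int div) {
  return (int)(((((long long)(num)) << 16) - 0x00010001) / (div - 1));
}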
michael@0 1309
michael@0 1310 #endif // defined(__x86_64__) || defined(__i386__)
michael@0 1311
michael@0 1312 #ifdef __cplusplus
michael@0 1313 } // extern "C"
michael@0 1314 } // namespace libyuv
michael@0 1315 #endif
