Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
michael@0 | 3 | * |
michael@0 | 4 | * Use of this source code is governed by a BSD-style license |
michael@0 | 5 | * that can be found in the LICENSE file in the root of the source |
michael@0 | 6 | * tree. An additional intellectual property rights grant can be found |
michael@0 | 7 | * in the file PATENTS. All contributing project authors may |
michael@0 | 8 | * be found in the AUTHORS file in the root of the source tree. |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "libyuv/row.h" |
michael@0 | 12 | |
michael@0 | 13 | #ifdef __cplusplus |
michael@0 | 14 | namespace libyuv { |
michael@0 | 15 | extern "C" { |
michael@0 | 16 | #endif |
michael@0 | 17 | |
michael@0 | 18 | // This module is for GCC x86 and x64. |
michael@0 | 19 | #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) |
michael@0 | 20 | |
michael@0 | 21 | // Offsets for source bytes 0 to 9 |
michael@0 | 22 | static uvec8 kShuf0 = |
michael@0 | 23 | { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; |
michael@0 | 24 | |
michael@0 | 25 | // Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. |
michael@0 | 26 | static uvec8 kShuf1 = |
michael@0 | 27 | { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; |
michael@0 | 28 | |
michael@0 | 29 | // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
michael@0 | 30 | static uvec8 kShuf2 = |
michael@0 | 31 | { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; |
michael@0 | 32 | |
michael@0 | 33 | // Offsets for source bytes 0 to 10 |
michael@0 | 34 | static uvec8 kShuf01 = |
michael@0 | 35 | { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; |
michael@0 | 36 | |
michael@0 | 37 | // Offsets for source bytes 10 to 21 with 8 subtracted = 2 to 13. |
michael@0 | 38 | static uvec8 kShuf11 = |
michael@0 | 39 | { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; |
michael@0 | 40 | |
michael@0 | 41 | // Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 15. |
michael@0 | 42 | static uvec8 kShuf21 = |
michael@0 | 43 | { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; |
michael@0 | 44 | |
michael@0 | 45 | // Coefficients for source bytes 0 to 10 |
michael@0 | 46 | static uvec8 kMadd01 = |
michael@0 | 47 | { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; |
michael@0 | 48 | |
michael@0 | 49 | // Coefficients for source bytes 10 to 21 |
michael@0 | 50 | static uvec8 kMadd11 = |
michael@0 | 51 | { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; |
michael@0 | 52 | |
michael@0 | 53 | // Coefficients for source bytes 21 to 31 |
michael@0 | 54 | static uvec8 kMadd21 = |
michael@0 | 55 | { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; |
michael@0 | 56 | |
michael@0 | 57 | // Rounding constant added before the >> 2 in the 3/4 box filters. |
michael@0 | 58 | static vec16 kRound34 = |
michael@0 | 59 | { 2, 2, 2, 2, 2, 2, 2, 2 }; |
michael@0 | 60 | |
michael@0 | 61 | static uvec8 kShuf38a = |
michael@0 | 62 | { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; |
michael@0 | 63 | |
michael@0 | 64 | static uvec8 kShuf38b = |
michael@0 | 65 | { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; |
michael@0 | 66 | |
michael@0 | 67 | // Arrange words 0,3,6 into 0,1,2 |
michael@0 | 68 | static uvec8 kShufAc = |
michael@0 | 69 | { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; |
michael@0 | 70 | |
michael@0 | 71 | // Arrange words 0,3,6 into 3,4,5 |
michael@0 | 72 | static uvec8 kShufAc3 = |
michael@0 | 73 | { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; |
michael@0 | 74 | |
michael@0 | 75 | // Scaling values for boxes of 3x3 and 2x3 |
michael@0 | 76 | static uvec16 kScaleAc33 = |
michael@0 | 77 | { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; |
michael@0 | 78 | |
michael@0 | 79 | // Arrange first value for pixels 0,1,2,3,4,5 |
michael@0 | 80 | static uvec8 kShufAb0 = |
michael@0 | 81 | { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; |
michael@0 | 82 | |
michael@0 | 83 | // Arrange second value for pixels 0,1,2,3,4,5 |
michael@0 | 84 | static uvec8 kShufAb1 = |
michael@0 | 85 | { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; |
michael@0 | 86 | |
michael@0 | 87 | // Arrange third value for pixels 0,1,2,3,4,5 |
michael@0 | 88 | static uvec8 kShufAb2 = |
michael@0 | 89 | { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; |
michael@0 | 90 | |
michael@0 | 91 | // Scaling values for boxes of 3x2 and 2x2 |
michael@0 | 92 | static uvec16 kScaleAb2 = |
michael@0 | 93 | { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; |
michael@0 | 94 | |
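// For reference: the kScaleAc33 / kScaleAb2 entries are reciprocals in 0.16
// fixed point. pmulhuw keeps the high 16 bits of a 16x16 multiply, so
// multiplying a box sum by 65536 / 9 and taking the high half approximates a
// divide by 9. A scalar sketch of the same trick (the name below is
// illustrative only, not part of libyuv):
static inline uint16 ScaleSumByReciprocal_Sketch(uint16 sum, uint16 reciprocal) {
  // Same per-lane operation as pmulhuw: (sum * reciprocal) >> 16.
  return (uint16)(((uint32)(sum) * reciprocal) >> 16);
}
// Example: a 3x3 box of value 200 sums to 1800; 1800 * (65536 / 9) >> 16 = 199.
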
michael@0 | 95 | // GCC versions of row functions are verbatim conversions from Visual C. |
michael@0 | 96 | // Generated using gcc disassembly on a Visual C object file: |
michael@0 | 97 | // objdump -D yuvscaler.obj >yuvscaler.txt |
michael@0 | 98 | |
michael@0 | 99 | void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 100 | uint8* dst_ptr, int dst_width) { |
michael@0 | 101 | asm volatile ( |
michael@0 | 102 | LABELALIGN |
michael@0 | 103 | "1: \n" |
michael@0 | 104 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 105 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 106 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 107 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 108 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 109 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 110 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 111 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 112 | "sub $0x10,%2 \n" |
michael@0 | 113 | "jg 1b \n" |
michael@0 | 114 | : "+r"(src_ptr), // %0 |
michael@0 | 115 | "+r"(dst_ptr), // %1 |
michael@0 | 116 | "+r"(dst_width) // %2 |
michael@0 | 117 | : |
michael@0 | 118 | : "memory", "cc" |
michael@0 | 119 | #if defined(__SSE2__) |
michael@0 | 120 | , "xmm0", "xmm1" |
michael@0 | 121 | #endif |
michael@0 | 122 | ); |
michael@0 | 123 | } |
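
// For reference, a scalar sketch of what the SSE2 loop above computes: the
// psrlw $0x8 / packuswb pair keeps the high byte of every 16-bit word, i.e.
// the odd-numbered source pixel of each pair. libyuv ships its own C fallback
// rows; the name below is illustrative only.
static void ScaleRowDown2_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                 int dst_width) {
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2 + 1];  // Point-sample the odd pixel of each pair.
  }
}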
michael@0 | 124 | |
michael@0 | 125 | void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 126 | uint8* dst_ptr, int dst_width) { |
michael@0 | 127 | asm volatile ( |
michael@0 | 128 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 129 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 130 | |
michael@0 | 131 | LABELALIGN |
michael@0 | 132 | "1: \n" |
michael@0 | 133 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 134 | "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" |
michael@0 | 135 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 136 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 137 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 138 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 139 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 140 | "pand %%xmm5,%%xmm2 \n" |
michael@0 | 141 | "pand %%xmm5,%%xmm3 \n" |
michael@0 | 142 | "pavgw %%xmm2,%%xmm0 \n" |
michael@0 | 143 | "pavgw %%xmm3,%%xmm1 \n" |
michael@0 | 144 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 145 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 146 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 147 | "sub $0x10,%2 \n" |
michael@0 | 148 | "jg 1b \n" |
michael@0 | 149 | : "+r"(src_ptr), // %0 |
michael@0 | 150 | "+r"(dst_ptr), // %1 |
michael@0 | 151 | "+r"(dst_width) // %2 |
michael@0 | 152 | : |
michael@0 | 153 | : "memory", "cc" |
michael@0 | 154 | #if defined(__SSE2__) |
michael@0 | 155 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 156 | #endif |
michael@0 | 157 | ); |
michael@0 | 158 | } |
michael@0 | 159 | |
michael@0 | 160 | void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 161 | uint8* dst_ptr, int dst_width) { |
michael@0 | 162 | asm volatile ( |
michael@0 | 163 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 164 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 165 | |
michael@0 | 166 | LABELALIGN |
michael@0 | 167 | "1: \n" |
michael@0 | 168 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 169 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 170 | MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 |
michael@0 | 171 | BUNDLEALIGN |
michael@0 | 172 | MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 |
michael@0 | 173 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 174 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 175 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 176 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 177 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 178 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 179 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 180 | "pand %%xmm5,%%xmm2 \n" |
michael@0 | 181 | "pand %%xmm5,%%xmm3 \n" |
michael@0 | 182 | "pavgw %%xmm2,%%xmm0 \n" |
michael@0 | 183 | "pavgw %%xmm3,%%xmm1 \n" |
michael@0 | 184 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 185 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 186 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 187 | "sub $0x10,%2 \n" |
michael@0 | 188 | "jg 1b \n" |
michael@0 | 189 | : "+r"(src_ptr), // %0 |
michael@0 | 190 | "+r"(dst_ptr), // %1 |
michael@0 | 191 | "+r"(dst_width) // %2 |
michael@0 | 192 | : "r"((intptr_t)(src_stride)) // %3 |
michael@0 | 193 | : "memory", "cc" |
michael@0 | 194 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 195 | , "r14" |
michael@0 | 196 | #endif |
michael@0 | 197 | #if defined(__SSE2__) |
michael@0 | 198 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 199 | #endif |
michael@0 | 200 | ); |
michael@0 | 201 | } |
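
// For reference, a scalar sketch of the 2x2 box filter above. The SSE2 code
// averages in two pavg stages, so its rounding can differ slightly from the
// single rounded divide below; the name is illustrative only.
static void ScaleRowDown2Box_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                    uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;
  int x;
  for (x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}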
michael@0 | 202 | |
michael@0 | 203 | void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 204 | uint8* dst_ptr, int dst_width) { |
michael@0 | 205 | asm volatile ( |
michael@0 | 206 | LABELALIGN |
michael@0 | 207 | "1: \n" |
michael@0 | 208 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 209 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 210 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 211 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 212 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 213 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 214 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 215 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 216 | "sub $0x10,%2 \n" |
michael@0 | 217 | "jg 1b \n" |
michael@0 | 218 | : "+r"(src_ptr), // %0 |
michael@0 | 219 | "+r"(dst_ptr), // %1 |
michael@0 | 220 | "+r"(dst_width) // %2 |
michael@0 | 221 | : |
michael@0 | 222 | : "memory", "cc" |
michael@0 | 223 | #if defined(__SSE2__) |
michael@0 | 224 | , "xmm0", "xmm1" |
michael@0 | 225 | #endif |
michael@0 | 226 | ); |
michael@0 | 227 | } |
michael@0 | 228 | |
michael@0 | 229 | void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, |
michael@0 | 230 | ptrdiff_t src_stride, |
michael@0 | 231 | uint8* dst_ptr, int dst_width) { |
michael@0 | 232 | asm volatile ( |
michael@0 | 233 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 234 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 235 | |
michael@0 | 236 | LABELALIGN |
michael@0 | 237 | "1: \n" |
michael@0 | 238 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 239 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 240 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 241 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 242 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 243 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 244 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 245 | "pand %%xmm5,%%xmm2 \n" |
michael@0 | 246 | "pand %%xmm5,%%xmm3 \n" |
michael@0 | 247 | "pavgw %%xmm2,%%xmm0 \n" |
michael@0 | 248 | "pavgw %%xmm3,%%xmm1 \n" |
michael@0 | 249 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 250 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 251 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 252 | "sub $0x10,%2 \n" |
michael@0 | 253 | "jg 1b \n" |
michael@0 | 254 | : "+r"(src_ptr), // %0 |
michael@0 | 255 | "+r"(dst_ptr), // %1 |
michael@0 | 256 | "+r"(dst_width) // %2 |
michael@0 | 257 | : |
michael@0 | 258 | : "memory", "cc" |
michael@0 | 259 | #if defined(__SSE2__) |
michael@0 | 260 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 261 | #endif |
michael@0 | 262 | ); |
michael@0 | 263 | } |
michael@0 | 264 | |
michael@0 | 265 | void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, |
michael@0 | 266 | ptrdiff_t src_stride, |
michael@0 | 267 | uint8* dst_ptr, int dst_width) { |
michael@0 | 268 | asm volatile ( |
michael@0 | 269 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 270 | "psrlw $0x8,%%xmm5 \n" |
michael@0 | 271 | |
michael@0 | 272 | LABELALIGN |
michael@0 | 273 | "1: \n" |
michael@0 | 274 | "movdqu " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 275 | "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 276 | MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 |
michael@0 | 277 | BUNDLEALIGN |
michael@0 | 278 | MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 |
michael@0 | 279 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 280 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 281 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 282 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 283 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 284 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 285 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 286 | "pand %%xmm5,%%xmm2 \n" |
michael@0 | 287 | "pand %%xmm5,%%xmm3 \n" |
michael@0 | 288 | "pavgw %%xmm2,%%xmm0 \n" |
michael@0 | 289 | "pavgw %%xmm3,%%xmm1 \n" |
michael@0 | 290 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 291 | "movdqu %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 292 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 293 | "sub $0x10,%2 \n" |
michael@0 | 294 | "jg 1b \n" |
michael@0 | 295 | : "+r"(src_ptr), // %0 |
michael@0 | 296 | "+r"(dst_ptr), // %1 |
michael@0 | 297 | "+r"(dst_width) // %2 |
michael@0 | 298 | : "r"((intptr_t)(src_stride)) // %3 |
michael@0 | 299 | : "memory", "cc" |
michael@0 | 300 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 301 | , "r14" |
michael@0 | 302 | #endif |
michael@0 | 303 | #if defined(__SSE2__) |
michael@0 | 304 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
michael@0 | 305 | #endif |
michael@0 | 306 | ); |
michael@0 | 307 | } |
michael@0 | 308 | |
michael@0 | 309 | void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 310 | uint8* dst_ptr, int dst_width) { |
michael@0 | 311 | asm volatile ( |
michael@0 | 312 | "pcmpeqb %%xmm5,%%xmm5 \n" |
michael@0 | 313 | "psrld $0x18,%%xmm5 \n" |
michael@0 | 314 | "pslld $0x10,%%xmm5 \n" |
michael@0 | 315 | |
michael@0 | 316 | LABELALIGN |
michael@0 | 317 | "1: \n" |
michael@0 | 318 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 319 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 320 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 321 | "pand %%xmm5,%%xmm0 \n" |
michael@0 | 322 | "pand %%xmm5,%%xmm1 \n" |
michael@0 | 323 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 324 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 325 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 326 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 327 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 328 | "sub $0x8,%2 \n" |
michael@0 | 329 | "jg 1b \n" |
michael@0 | 330 | : "+r"(src_ptr), // %0 |
michael@0 | 331 | "+r"(dst_ptr), // %1 |
michael@0 | 332 | "+r"(dst_width) // %2 |
michael@0 | 333 | : |
michael@0 | 334 | : "memory", "cc" |
michael@0 | 335 | #if defined(__SSE2__) |
michael@0 | 336 | , "xmm0", "xmm1", "xmm5" |
michael@0 | 337 | #endif |
michael@0 | 338 | ); |
michael@0 | 339 | } |
michael@0 | 340 | |
michael@0 | 341 | void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 342 | uint8* dst_ptr, int dst_width) { |
michael@0 | 343 | intptr_t stridex3 = 0; |
michael@0 | 344 | asm volatile ( |
michael@0 | 345 | "pcmpeqb %%xmm7,%%xmm7 \n" |
michael@0 | 346 | "psrlw $0x8,%%xmm7 \n" |
michael@0 | 347 | "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" |
michael@0 | 348 | |
michael@0 | 349 | LABELALIGN |
michael@0 | 350 | "1: \n" |
michael@0 | 351 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 352 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 353 | MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 |
michael@0 | 354 | BUNDLEALIGN |
michael@0 | 355 | MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 |
michael@0 | 356 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 357 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 358 | MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 |
michael@0 | 359 | BUNDLEALIGN |
michael@0 | 360 | MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 |
michael@0 | 361 | MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 |
michael@0 | 362 | MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 |
michael@0 | 363 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 364 | "pavgb %%xmm4,%%xmm2 \n" |
michael@0 | 365 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 366 | "pavgb %%xmm5,%%xmm3 \n" |
michael@0 | 367 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 368 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 369 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 370 | "movdqa %%xmm1,%%xmm3 \n" |
michael@0 | 371 | "psrlw $0x8,%%xmm1 \n" |
michael@0 | 372 | "pand %%xmm7,%%xmm2 \n" |
michael@0 | 373 | "pand %%xmm7,%%xmm3 \n" |
michael@0 | 374 | "pavgw %%xmm2,%%xmm0 \n" |
michael@0 | 375 | "pavgw %%xmm3,%%xmm1 \n" |
michael@0 | 376 | "packuswb %%xmm1,%%xmm0 \n" |
michael@0 | 377 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 378 | "psrlw $0x8,%%xmm0 \n" |
michael@0 | 379 | "pand %%xmm7,%%xmm2 \n" |
michael@0 | 380 | "pavgw %%xmm2,%%xmm0 \n" |
michael@0 | 381 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 382 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 383 | "lea " MEMLEA(0x8,1) ",%1 \n" |
michael@0 | 384 | "sub $0x8,%2 \n" |
michael@0 | 385 | "jg 1b \n" |
michael@0 | 386 | : "+r"(src_ptr), // %0 |
michael@0 | 387 | "+r"(dst_ptr), // %1 |
michael@0 | 388 | "+r"(dst_width), // %2 |
michael@0 | 389 | "+r"(stridex3) // %3 |
michael@0 | 390 | : "r"((intptr_t)(src_stride)) // %4 |
michael@0 | 391 | : "memory", "cc" |
michael@0 | 392 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 393 | , "r14" |
michael@0 | 394 | #endif |
michael@0 | 395 | #if defined(__SSE2__) |
michael@0 | 396 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" |
michael@0 | 397 | #endif |
michael@0 | 398 | ); |
michael@0 | 399 | } |
michael@0 | 400 | |
michael@0 | 401 | void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 402 | uint8* dst_ptr, int dst_width) { |
michael@0 | 403 | asm volatile ( |
michael@0 | 404 | "movdqa %0,%%xmm3 \n" |
michael@0 | 405 | "movdqa %1,%%xmm4 \n" |
michael@0 | 406 | "movdqa %2,%%xmm5 \n" |
michael@0 | 407 | : |
michael@0 | 408 | : "m"(kShuf0), // %0 |
michael@0 | 409 | "m"(kShuf1), // %1 |
michael@0 | 410 | "m"(kShuf2) // %2 |
michael@0 | 411 | ); |
michael@0 | 412 | asm volatile ( |
michael@0 | 413 | LABELALIGN |
michael@0 | 414 | "1: \n" |
michael@0 | 415 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 416 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" |
michael@0 | 417 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 418 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 419 | "palignr $0x8,%%xmm0,%%xmm1 \n" |
michael@0 | 420 | "pshufb %%xmm3,%%xmm0 \n" |
michael@0 | 421 | "pshufb %%xmm4,%%xmm1 \n" |
michael@0 | 422 | "pshufb %%xmm5,%%xmm2 \n" |
michael@0 | 423 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 424 | "movq %%xmm1," MEMACCESS2(0x8,1) " \n" |
michael@0 | 425 | "movq %%xmm2," MEMACCESS2(0x10,1) " \n" |
michael@0 | 426 | "lea " MEMLEA(0x18,1) ",%1 \n" |
michael@0 | 427 | "sub $0x18,%2 \n" |
michael@0 | 428 | "jg 1b \n" |
michael@0 | 429 | : "+r"(src_ptr), // %0 |
michael@0 | 430 | "+r"(dst_ptr), // %1 |
michael@0 | 431 | "+r"(dst_width) // %2 |
michael@0 | 432 | : |
michael@0 | 433 | : "memory", "cc" |
michael@0 | 434 | #if defined(__SSE2__) |
michael@0 | 435 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
michael@0 | 436 | #endif |
michael@0 | 437 | ); |
michael@0 | 438 | } |
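
// For reference: the kShuf0/kShuf1/kShuf2 masks used above keep source bytes
// 0,1,3, 4,5,7, 8,9,11, ... i.e. pixels 0, 1 and 3 of every group of 4.
// A scalar sketch of that 4 -> 3 point sampling (illustrative name only):
static void ScaleRowDown34_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;
    src_ptr += 4;
  }
}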
michael@0 | 439 | |
michael@0 | 440 | void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, |
michael@0 | 441 | ptrdiff_t src_stride, |
michael@0 | 442 | uint8* dst_ptr, int dst_width) { |
michael@0 | 443 | asm volatile ( |
michael@0 | 444 | "movdqa %0,%%xmm2 \n" // kShuf01 |
michael@0 | 445 | "movdqa %1,%%xmm3 \n" // kShuf11 |
michael@0 | 446 | "movdqa %2,%%xmm4 \n" // kShuf21 |
michael@0 | 447 | : |
michael@0 | 448 | : "m"(kShuf01), // %0 |
michael@0 | 449 | "m"(kShuf11), // %1 |
michael@0 | 450 | "m"(kShuf21) // %2 |
michael@0 | 451 | ); |
michael@0 | 452 | asm volatile ( |
michael@0 | 453 | "movdqa %0,%%xmm5 \n" // kMadd01 |
michael@0 | 454 | "movdqa %1,%%xmm0 \n" // kMadd11 |
michael@0 | 455 | "movdqa %2,%%xmm1 \n" // kRound34 |
michael@0 | 456 | : |
michael@0 | 457 | : "m"(kMadd01), // %0 |
michael@0 | 458 | "m"(kMadd11), // %1 |
michael@0 | 459 | "m"(kRound34) // %2 |
michael@0 | 460 | ); |
michael@0 | 461 | asm volatile ( |
michael@0 | 462 | LABELALIGN |
michael@0 | 463 | "1: \n" |
michael@0 | 464 | "movdqa " MEMACCESS(0) ",%%xmm6 \n" |
michael@0 | 465 | MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 |
michael@0 | 466 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 467 | "pshufb %%xmm2,%%xmm6 \n" |
michael@0 | 468 | "pmaddubsw %%xmm5,%%xmm6 \n" |
michael@0 | 469 | "paddsw %%xmm1,%%xmm6 \n" |
michael@0 | 470 | "psrlw $0x2,%%xmm6 \n" |
michael@0 | 471 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 472 | "movq %%xmm6," MEMACCESS(1) " \n" |
michael@0 | 473 | "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" |
michael@0 | 474 | MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 |
michael@0 | 475 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 476 | "pshufb %%xmm3,%%xmm6 \n" |
michael@0 | 477 | "pmaddubsw %%xmm0,%%xmm6 \n" |
michael@0 | 478 | "paddsw %%xmm1,%%xmm6 \n" |
michael@0 | 479 | "psrlw $0x2,%%xmm6 \n" |
michael@0 | 480 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 481 | "movq %%xmm6," MEMACCESS2(0x8,1) " \n" |
michael@0 | 482 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" |
michael@0 | 483 | BUNDLEALIGN |
michael@0 | 484 | MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 |
michael@0 | 485 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 486 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 487 | "pshufb %%xmm4,%%xmm6 \n" |
michael@0 | 488 | "pmaddubsw %4,%%xmm6 \n" |
michael@0 | 489 | "paddsw %%xmm1,%%xmm6 \n" |
michael@0 | 490 | "psrlw $0x2,%%xmm6 \n" |
michael@0 | 491 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 492 | "movq %%xmm6," MEMACCESS2(0x10,1) " \n" |
michael@0 | 493 | "lea " MEMLEA(0x18,1) ",%1 \n" |
michael@0 | 494 | "sub $0x18,%2 \n" |
michael@0 | 495 | "jg 1b \n" |
michael@0 | 496 | : "+r"(src_ptr), // %0 |
michael@0 | 497 | "+r"(dst_ptr), // %1 |
michael@0 | 498 | "+r"(dst_width) // %2 |
michael@0 | 499 | : "r"((intptr_t)(src_stride)), // %3 |
michael@0 | 500 | "m"(kMadd21) // %4 |
michael@0 | 501 | : "memory", "cc" |
michael@0 | 502 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 503 | , "r14" |
michael@0 | 504 | #endif |
michael@0 | 505 | #if defined(__SSE2__) |
michael@0 | 506 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 507 | #endif |
michael@0 | 508 | ); |
michael@0 | 509 | } |
michael@0 | 510 | |
michael@0 | 511 | void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, |
michael@0 | 512 | ptrdiff_t src_stride, |
michael@0 | 513 | uint8* dst_ptr, int dst_width) { |
michael@0 | 514 | asm volatile ( |
michael@0 | 515 | "movdqa %0,%%xmm2 \n" // kShuf01 |
michael@0 | 516 | "movdqa %1,%%xmm3 \n" // kShuf11 |
michael@0 | 517 | "movdqa %2,%%xmm4 \n" // kShuf21 |
michael@0 | 518 | : |
michael@0 | 519 | : "m"(kShuf01), // %0 |
michael@0 | 520 | "m"(kShuf11), // %1 |
michael@0 | 521 | "m"(kShuf21) // %2 |
michael@0 | 522 | ); |
michael@0 | 523 | asm volatile ( |
michael@0 | 524 | "movdqa %0,%%xmm5 \n" // kMadd01 |
michael@0 | 525 | "movdqa %1,%%xmm0 \n" // kMadd11 |
michael@0 | 526 | "movdqa %2,%%xmm1 \n" // kRound34 |
michael@0 | 527 | : |
michael@0 | 528 | : "m"(kMadd01), // %0 |
michael@0 | 529 | "m"(kMadd11), // %1 |
michael@0 | 530 | "m"(kRound34) // %2 |
michael@0 | 531 | ); |
michael@0 | 532 | |
michael@0 | 533 | asm volatile ( |
michael@0 | 534 | LABELALIGN |
michael@0 | 535 | "1: \n" |
michael@0 | 536 | "movdqa " MEMACCESS(0) ",%%xmm6 \n" |
michael@0 | 537 | MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 |
michael@0 | 538 | "pavgb %%xmm6,%%xmm7 \n" |
michael@0 | 539 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 540 | "pshufb %%xmm2,%%xmm6 \n" |
michael@0 | 541 | "pmaddubsw %%xmm5,%%xmm6 \n" |
michael@0 | 542 | "paddsw %%xmm1,%%xmm6 \n" |
michael@0 | 543 | "psrlw $0x2,%%xmm6 \n" |
michael@0 | 544 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 545 | "movq %%xmm6," MEMACCESS(1) " \n" |
michael@0 | 546 | "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" |
michael@0 | 547 | MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 |
michael@0 | 548 | "pavgb %%xmm6,%%xmm7 \n" |
michael@0 | 549 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 550 | "pshufb %%xmm3,%%xmm6 \n" |
michael@0 | 551 | "pmaddubsw %%xmm0,%%xmm6 \n" |
michael@0 | 552 | "paddsw %%xmm1,%%xmm6 \n" |
michael@0 | 553 | "psrlw $0x2,%%xmm6 \n" |
michael@0 | 554 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 555 | "movq %%xmm6," MEMACCESS2(0x8,1) " \n" |
michael@0 | 556 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" |
michael@0 | 557 | MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 |
michael@0 | 558 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 559 | "pavgb %%xmm6,%%xmm7 \n" |
michael@0 | 560 | "pavgb %%xmm7,%%xmm6 \n" |
michael@0 | 561 | "pshufb %%xmm4,%%xmm6 \n" |
michael@0 | 562 | "pmaddubsw %4,%%xmm6 \n" |
michael@0 | 563 | "paddsw %%xmm1,%%xmm6 \n" |
michael@0 | 564 | "psrlw $0x2,%%xmm6 \n" |
michael@0 | 565 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 566 | "movq %%xmm6," MEMACCESS2(0x10,1) " \n" |
michael@0 | 567 | "lea " MEMLEA(0x18,1) ",%1 \n" |
michael@0 | 568 | "sub $0x18,%2 \n" |
michael@0 | 569 | "jg 1b \n" |
michael@0 | 570 | : "+r"(src_ptr), // %0 |
michael@0 | 571 | "+r"(dst_ptr), // %1 |
michael@0 | 572 | "+r"(dst_width) // %2 |
michael@0 | 573 | : "r"((intptr_t)(src_stride)), // %3 |
michael@0 | 574 | "m"(kMadd21) // %4 |
michael@0 | 575 | : "memory", "cc" |
michael@0 | 576 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 577 | , "r14" |
michael@0 | 578 | #endif |
michael@0 | 579 | #if defined(__SSE2__) |
michael@0 | 580 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 581 | #endif |
michael@0 | 582 | ); |
michael@0 | 583 | } |
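
// For reference: in the two 3/4 box variants above, each kMadd coefficient
// pair feeds pmaddubsw, kRound34 adds the rounding bias, and psrlw $0x2
// divides by 4, so a (3,1) pair produces out = (3 * a + b + 2) >> 2.
// A scalar sketch of that single tap (illustrative name only):
static inline uint8 WeightedTap31_Sketch(uint8 a, uint8 b) {
  return (uint8)((3 * a + b + 2) >> 2);
}
// The (1,3) and (2,2) pairs in kMadd01/kMadd11/kMadd21 work the same way with
// the weights swapped or equal.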
michael@0 | 584 | |
michael@0 | 585 | void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 586 | uint8* dst_ptr, int dst_width) { |
michael@0 | 587 | asm volatile ( |
michael@0 | 588 | "movdqa %3,%%xmm4 \n" |
michael@0 | 589 | "movdqa %4,%%xmm5 \n" |
michael@0 | 590 | |
michael@0 | 591 | LABELALIGN |
michael@0 | 592 | "1: \n" |
michael@0 | 593 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 594 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 595 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 596 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 597 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 598 | "paddusb %%xmm1,%%xmm0 \n" |
michael@0 | 599 | "movq %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 600 | "movhlps %%xmm0,%%xmm1 \n" |
michael@0 | 601 | "movd %%xmm1," MEMACCESS2(0x8,1) " \n" |
michael@0 | 602 | "lea " MEMLEA(0xc,1) ",%1 \n" |
michael@0 | 603 | "sub $0xc,%2 \n" |
michael@0 | 604 | "jg 1b \n" |
michael@0 | 605 | : "+r"(src_ptr), // %0 |
michael@0 | 606 | "+r"(dst_ptr), // %1 |
michael@0 | 607 | "+r"(dst_width) // %2 |
michael@0 | 608 | : "m"(kShuf38a), // %3 |
michael@0 | 609 | "m"(kShuf38b) // %4 |
michael@0 | 610 | : "memory", "cc" |
michael@0 | 611 | #if defined(__SSE2__) |
michael@0 | 612 | , "xmm0", "xmm1", "xmm4", "xmm5" |
michael@0 | 613 | #endif |
michael@0 | 614 | ); |
michael@0 | 615 | } |
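
// For reference: kShuf38a/kShuf38b pick source bytes 0, 3, 6, 8, 11 and 14 of
// every 16, i.e. 3 output pixels per 8 input pixels sampled at offsets 0, 3
// and 6. A scalar sketch (illustrative name only):
static void ScaleRowDown38_Sketch(const uint8* src_ptr, uint8* dst_ptr,
                                  int dst_width) {
  int x;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[3];
    dst_ptr[2] = src_ptr[6];
    dst_ptr += 3;
    src_ptr += 8;
  }
}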
michael@0 | 616 | |
michael@0 | 617 | void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, |
michael@0 | 618 | ptrdiff_t src_stride, |
michael@0 | 619 | uint8* dst_ptr, int dst_width) { |
michael@0 | 620 | asm volatile ( |
michael@0 | 621 | "movdqa %0,%%xmm2 \n" |
michael@0 | 622 | "movdqa %1,%%xmm3 \n" |
michael@0 | 623 | "movdqa %2,%%xmm4 \n" |
michael@0 | 624 | "movdqa %3,%%xmm5 \n" |
michael@0 | 625 | : |
michael@0 | 626 | : "m"(kShufAb0), // %0 |
michael@0 | 627 | "m"(kShufAb1), // %1 |
michael@0 | 628 | "m"(kShufAb2), // %2 |
michael@0 | 629 | "m"(kScaleAb2) // %3 |
michael@0 | 630 | ); |
michael@0 | 631 | asm volatile ( |
michael@0 | 632 | LABELALIGN |
michael@0 | 633 | "1: \n" |
michael@0 | 634 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 635 | MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 |
michael@0 | 636 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 637 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 638 | "pshufb %%xmm2,%%xmm1 \n" |
michael@0 | 639 | "movdqa %%xmm0,%%xmm6 \n" |
michael@0 | 640 | "pshufb %%xmm3,%%xmm6 \n" |
michael@0 | 641 | "paddusw %%xmm6,%%xmm1 \n" |
michael@0 | 642 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 643 | "paddusw %%xmm0,%%xmm1 \n" |
michael@0 | 644 | "pmulhuw %%xmm5,%%xmm1 \n" |
michael@0 | 645 | "packuswb %%xmm1,%%xmm1 \n" |
michael@0 | 646 | "sub $0x6,%2 \n" |
michael@0 | 647 | "movd %%xmm1," MEMACCESS(1) " \n" |
michael@0 | 648 | "psrlq $0x10,%%xmm1 \n" |
michael@0 | 649 | "movd %%xmm1," MEMACCESS2(0x2,1) " \n" |
michael@0 | 650 | "lea " MEMLEA(0x6,1) ",%1 \n" |
michael@0 | 651 | "jg 1b \n" |
michael@0 | 652 | : "+r"(src_ptr), // %0 |
michael@0 | 653 | "+r"(dst_ptr), // %1 |
michael@0 | 654 | "+r"(dst_width) // %2 |
michael@0 | 655 | : "r"((intptr_t)(src_stride)) // %3 |
michael@0 | 656 | : "memory", "cc" |
michael@0 | 657 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 658 | , "r14" |
michael@0 | 659 | #endif |
michael@0 | 660 | #if defined(__SSE2__) |
michael@0 | 661 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 662 | #endif |
michael@0 | 663 | ); |
michael@0 | 664 | } |
michael@0 | 665 | |
michael@0 | 666 | void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, |
michael@0 | 667 | ptrdiff_t src_stride, |
michael@0 | 668 | uint8* dst_ptr, int dst_width) { |
michael@0 | 669 | asm volatile ( |
michael@0 | 670 | "movdqa %0,%%xmm2 \n" |
michael@0 | 671 | "movdqa %1,%%xmm3 \n" |
michael@0 | 672 | "movdqa %2,%%xmm4 \n" |
michael@0 | 673 | "pxor %%xmm5,%%xmm5 \n" |
michael@0 | 674 | : |
michael@0 | 675 | : "m"(kShufAc), // %0 |
michael@0 | 676 | "m"(kShufAc3), // %1 |
michael@0 | 677 | "m"(kScaleAc33) // %2 |
michael@0 | 678 | ); |
michael@0 | 679 | asm volatile ( |
michael@0 | 680 | LABELALIGN |
michael@0 | 681 | "1: \n" |
michael@0 | 682 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 683 | MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 |
michael@0 | 684 | "movhlps %%xmm0,%%xmm1 \n" |
michael@0 | 685 | "movhlps %%xmm6,%%xmm7 \n" |
michael@0 | 686 | "punpcklbw %%xmm5,%%xmm0 \n" |
michael@0 | 687 | "punpcklbw %%xmm5,%%xmm1 \n" |
michael@0 | 688 | "punpcklbw %%xmm5,%%xmm6 \n" |
michael@0 | 689 | "punpcklbw %%xmm5,%%xmm7 \n" |
michael@0 | 690 | "paddusw %%xmm6,%%xmm0 \n" |
michael@0 | 691 | "paddusw %%xmm7,%%xmm1 \n" |
michael@0 | 692 | MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 |
michael@0 | 693 | "lea " MEMLEA(0x10,0) ",%0 \n" |
michael@0 | 694 | "movhlps %%xmm6,%%xmm7 \n" |
michael@0 | 695 | "punpcklbw %%xmm5,%%xmm6 \n" |
michael@0 | 696 | "punpcklbw %%xmm5,%%xmm7 \n" |
michael@0 | 697 | "paddusw %%xmm6,%%xmm0 \n" |
michael@0 | 698 | "paddusw %%xmm7,%%xmm1 \n" |
michael@0 | 699 | "movdqa %%xmm0,%%xmm6 \n" |
michael@0 | 700 | "psrldq $0x2,%%xmm0 \n" |
michael@0 | 701 | "paddusw %%xmm0,%%xmm6 \n" |
michael@0 | 702 | "psrldq $0x2,%%xmm0 \n" |
michael@0 | 703 | "paddusw %%xmm0,%%xmm6 \n" |
michael@0 | 704 | "pshufb %%xmm2,%%xmm6 \n" |
michael@0 | 705 | "movdqa %%xmm1,%%xmm7 \n" |
michael@0 | 706 | "psrldq $0x2,%%xmm1 \n" |
michael@0 | 707 | "paddusw %%xmm1,%%xmm7 \n" |
michael@0 | 708 | "psrldq $0x2,%%xmm1 \n" |
michael@0 | 709 | "paddusw %%xmm1,%%xmm7 \n" |
michael@0 | 710 | "pshufb %%xmm3,%%xmm7 \n" |
michael@0 | 711 | "paddusw %%xmm7,%%xmm6 \n" |
michael@0 | 712 | "pmulhuw %%xmm4,%%xmm6 \n" |
michael@0 | 713 | "packuswb %%xmm6,%%xmm6 \n" |
michael@0 | 714 | "sub $0x6,%2 \n" |
michael@0 | 715 | "movd %%xmm6," MEMACCESS(1) " \n" |
michael@0 | 716 | "psrlq $0x10,%%xmm6 \n" |
michael@0 | 717 | "movd %%xmm6," MEMACCESS2(0x2,1) " \n" |
michael@0 | 718 | "lea " MEMLEA(0x6,1) ",%1 \n" |
michael@0 | 719 | "jg 1b \n" |
michael@0 | 720 | : "+r"(src_ptr), // %0 |
michael@0 | 721 | "+r"(dst_ptr), // %1 |
michael@0 | 722 | "+r"(dst_width) // %2 |
michael@0 | 723 | : "r"((intptr_t)(src_stride)) // %3 |
michael@0 | 724 | : "memory", "cc" |
michael@0 | 725 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 726 | , "r14" |
michael@0 | 727 | #endif |
michael@0 | 728 | #if defined(__SSE2__) |
michael@0 | 729 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
michael@0 | 730 | #endif |
michael@0 | 731 | ); |
michael@0 | 732 | } |
michael@0 | 733 | |
michael@0 | 734 | void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
michael@0 | 735 | uint16* dst_ptr, int src_width, int src_height) { |
michael@0 | 736 | int tmp_height = 0; |
michael@0 | 737 | intptr_t tmp_src = 0; |
michael@0 | 738 | asm volatile ( |
michael@0 | 739 | "pxor %%xmm4,%%xmm4 \n" |
michael@0 | 740 | "sub $0x1,%5 \n" |
michael@0 | 741 | |
michael@0 | 742 | LABELALIGN |
michael@0 | 743 | "1: \n" |
michael@0 | 744 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 745 | "mov %0,%3 \n" |
michael@0 | 746 | "add %6,%0 \n" |
michael@0 | 747 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 748 | "punpcklbw %%xmm4,%%xmm0 \n" |
michael@0 | 749 | "punpckhbw %%xmm4,%%xmm1 \n" |
michael@0 | 750 | "mov %5,%2 \n" |
michael@0 | 751 | "test %2,%2 \n" |
michael@0 | 752 | "je 3f \n" |
michael@0 | 753 | |
michael@0 | 754 | LABELALIGN |
michael@0 | 755 | "2: \n" |
michael@0 | 756 | "movdqa " MEMACCESS(0) ",%%xmm2 \n" |
michael@0 | 757 | "add %6,%0 \n" |
michael@0 | 758 | "movdqa %%xmm2,%%xmm3 \n" |
michael@0 | 759 | "punpcklbw %%xmm4,%%xmm2 \n" |
michael@0 | 760 | "punpckhbw %%xmm4,%%xmm3 \n" |
michael@0 | 761 | "paddusw %%xmm2,%%xmm0 \n" |
michael@0 | 762 | "paddusw %%xmm3,%%xmm1 \n" |
michael@0 | 763 | "sub $0x1,%2 \n" |
michael@0 | 764 | "jg 2b \n" |
michael@0 | 765 | |
michael@0 | 766 | LABELALIGN |
michael@0 | 767 | "3: \n" |
michael@0 | 768 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 769 | "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" |
michael@0 | 770 | "lea " MEMLEA(0x10,3) ",%0 \n" |
michael@0 | 771 | "lea " MEMLEA(0x20,1) ",%1 \n" |
michael@0 | 772 | "sub $0x10,%4 \n" |
michael@0 | 773 | "jg 1b \n" |
michael@0 | 774 | : "+r"(src_ptr), // %0 |
michael@0 | 775 | "+r"(dst_ptr), // %1 |
michael@0 | 776 | "+r"(tmp_height), // %2 |
michael@0 | 777 | "+r"(tmp_src), // %3 |
michael@0 | 778 | "+r"(src_width), // %4 |
michael@0 | 779 | "+rm"(src_height) // %5 |
michael@0 | 780 | : "rm"((intptr_t)(src_stride)) // %6 |
michael@0 | 781 | : "memory", "cc" |
michael@0 | 782 | #if defined(__SSE2__) |
michael@0 | 783 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
michael@0 | 784 | #endif |
michael@0 | 785 | ); |
michael@0 | 786 | } |
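
// For reference, a scalar sketch of ScaleAddRows_SSE2: each output word is the
// column sum over src_height rows. The SSE2 path uses saturating adds
// (paddusw); the sketch mimics that by clamping. Illustrative name only.
static void ScaleAddRows_Sketch(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint16* dst_ptr, int src_width, int src_height) {
  int x, y;
  for (x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    uint32 sum = 0;
    for (y = 0; y < src_height; ++y) {
      sum += s[0];
      s += src_stride;
    }
    dst_ptr[x] = (uint16)(sum < 65535u ? sum : 65535u);  // Mimic saturation.
  }
}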
michael@0 | 787 | |
michael@0 | 788 | // Bilinear column filtering. SSSE3 version. |
michael@0 | 789 | void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 790 | int dst_width, int x, int dx) { |
michael@0 | 791 | intptr_t x0 = 0, x1 = 0, temp_pixel = 0; |
michael@0 | 792 | asm volatile ( |
michael@0 | 793 | "movd %6,%%xmm2 \n" |
michael@0 | 794 | "movd %7,%%xmm3 \n" |
michael@0 | 795 | "movl $0x04040000,%k2 \n" |
michael@0 | 796 | "movd %k2,%%xmm5 \n" |
michael@0 | 797 | "pcmpeqb %%xmm6,%%xmm6 \n" |
michael@0 | 798 | "psrlw $0x9,%%xmm6 \n" |
michael@0 | 799 | "pextrw $0x1,%%xmm2,%k3 \n" |
michael@0 | 800 | "subl $0x2,%5 \n" |
michael@0 | 801 | "jl 29f \n" |
michael@0 | 802 | "movdqa %%xmm2,%%xmm0 \n" |
michael@0 | 803 | "paddd %%xmm3,%%xmm0 \n" |
michael@0 | 804 | "punpckldq %%xmm0,%%xmm2 \n" |
michael@0 | 805 | "punpckldq %%xmm3,%%xmm3 \n" |
michael@0 | 806 | "paddd %%xmm3,%%xmm3 \n" |
michael@0 | 807 | "pextrw $0x3,%%xmm2,%k4 \n" |
michael@0 | 808 | |
michael@0 | 809 | LABELALIGN |
michael@0 | 810 | "2: \n" |
michael@0 | 811 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 812 | "paddd %%xmm3,%%xmm2 \n" |
michael@0 | 813 | MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 |
michael@0 | 814 | "movd %k2,%%xmm0 \n" |
michael@0 | 815 | "psrlw $0x9,%%xmm1 \n" |
michael@0 | 816 | BUNDLEALIGN |
michael@0 | 817 | MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 |
michael@0 | 818 | "movd %k2,%%xmm4 \n" |
michael@0 | 819 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 820 | "punpcklwd %%xmm4,%%xmm0 \n" |
michael@0 | 821 | "pxor %%xmm6,%%xmm1 \n" |
michael@0 | 822 | "pmaddubsw %%xmm1,%%xmm0 \n" |
michael@0 | 823 | "pextrw $0x1,%%xmm2,%k3 \n" |
michael@0 | 824 | "pextrw $0x3,%%xmm2,%k4 \n" |
michael@0 | 825 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 826 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 827 | "movd %%xmm0,%k2 \n" |
michael@0 | 828 | "mov %w2," MEMACCESS(0) " \n" |
michael@0 | 829 | "lea " MEMLEA(0x2,0) ",%0 \n" |
michael@0 | 830 | "sub $0x2,%5 \n" |
michael@0 | 831 | "jge 2b \n" |
michael@0 | 832 | |
michael@0 | 833 | LABELALIGN |
michael@0 | 834 | "29: \n" |
michael@0 | 835 | "addl $0x1,%5 \n" |
michael@0 | 836 | "jl 99f \n" |
michael@0 | 837 | MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 |
michael@0 | 838 | "movd %k2,%%xmm0 \n" |
michael@0 | 839 | "psrlw $0x9,%%xmm2 \n" |
michael@0 | 840 | "pshufb %%xmm5,%%xmm2 \n" |
michael@0 | 841 | "pxor %%xmm6,%%xmm2 \n" |
michael@0 | 842 | "pmaddubsw %%xmm2,%%xmm0 \n" |
michael@0 | 843 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 844 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 845 | "movd %%xmm0,%k2 \n" |
michael@0 | 846 | "mov %b2," MEMACCESS(0) " \n" |
michael@0 | 847 | "99: \n" |
michael@0 | 848 | : "+r"(dst_ptr), // %0 |
michael@0 | 849 | "+r"(src_ptr), // %1 |
michael@0 | 850 | "+a"(temp_pixel), // %2 |
michael@0 | 851 | "+r"(x0), // %3 |
michael@0 | 852 | "+r"(x1), // %4 |
michael@0 | 853 | "+rm"(dst_width) // %5 |
michael@0 | 854 | : "rm"(x), // %6 |
michael@0 | 855 | "rm"(dx) // %7 |
michael@0 | 856 | : "memory", "cc" |
michael@0 | 857 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 858 | , "r14" |
michael@0 | 859 | #endif |
michael@0 | 860 | #if defined(__SSE2__) |
michael@0 | 861 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 862 | #endif |
michael@0 | 863 | ); |
michael@0 | 864 | } |
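
// For reference: x and dx are 16.16 fixed-point source position and step. For
// each output pixel the code fetches the two neighbouring source pixels and
// blends them by the fractional part. A scalar sketch of that idea, using the
// same 7-bit fraction the asm derives with psrlw $0x9 (the exact SSSE3
// weighting and rounding differ slightly; illustrative name only):
static void ScaleFilterCols_Sketch(uint8* dst_ptr, const uint8* src_ptr,
                                   int dst_width, int x, int dx) {
  int j;
  for (j = 0; j < dst_width; ++j) {
    int xi = x >> 16;             // Integer source index.
    int xf = (x >> 9) & 0x7f;     // Top 7 bits of the 16-bit fraction.
    int a = src_ptr[xi];
    int b = src_ptr[xi + 1];
    dst_ptr[j] = (uint8)((a * (128 - xf) + b * xf) >> 7);
    x += dx;
  }
}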
michael@0 | 865 | |
michael@0 | 866 | // Reads 16 pixels, duplicates them and writes 32 pixels. |
michael@0 | 867 | // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. |
michael@0 | 868 | void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
michael@0 | 869 | int dst_width, int x, int dx) { |
michael@0 | 870 | asm volatile ( |
michael@0 | 871 | LABELALIGN |
michael@0 | 872 | "1: \n" |
michael@0 | 873 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 874 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 875 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 876 | "punpcklbw %%xmm0,%%xmm0 \n" |
michael@0 | 877 | "punpckhbw %%xmm1,%%xmm1 \n" |
michael@0 | 878 | "sub $0x20,%2 \n" |
michael@0 | 879 | "movdqa %%xmm0," MEMACCESS(0) " \n" |
michael@0 | 880 | "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" |
michael@0 | 881 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 882 | "jg 1b \n" |
michael@0 | 883 | |
michael@0 | 884 | : "+r"(dst_ptr), // %0 |
michael@0 | 885 | "+r"(src_ptr), // %1 |
michael@0 | 886 | "+r"(dst_width) // %2 |
michael@0 | 887 | : |
michael@0 | 888 | : "memory", "cc" |
michael@0 | 889 | #if defined(__SSE2__) |
michael@0 | 890 | , "xmm0", "xmm1" |
michael@0 | 891 | #endif |
michael@0 | 892 | ); |
michael@0 | 893 | } |
michael@0 | 894 | |
michael@0 | 895 | void ScaleARGBRowDown2_SSE2(const uint8* src_argb, |
michael@0 | 896 | ptrdiff_t src_stride, |
michael@0 | 897 | uint8* dst_argb, int dst_width) { |
michael@0 | 898 | asm volatile ( |
michael@0 | 899 | LABELALIGN |
michael@0 | 900 | "1: \n" |
michael@0 | 901 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 902 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 903 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 904 | "shufps $0xdd,%%xmm1,%%xmm0 \n" |
michael@0 | 905 | "sub $0x4,%2 \n" |
michael@0 | 906 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 907 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 908 | "jg 1b \n" |
michael@0 | 909 | : "+r"(src_argb), // %0 |
michael@0 | 910 | "+r"(dst_argb), // %1 |
michael@0 | 911 | "+r"(dst_width) // %2 |
michael@0 | 912 | : |
michael@0 | 913 | : "memory", "cc" |
michael@0 | 914 | #if defined(__SSE2__) |
michael@0 | 915 | , "xmm0", "xmm1" |
michael@0 | 916 | #endif |
michael@0 | 917 | ); |
michael@0 | 918 | } |
michael@0 | 919 | |
michael@0 | 920 | void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, |
michael@0 | 921 | ptrdiff_t src_stride, |
michael@0 | 922 | uint8* dst_argb, int dst_width) { |
michael@0 | 923 | asm volatile ( |
michael@0 | 924 | LABELALIGN |
michael@0 | 925 | "1: \n" |
michael@0 | 926 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 927 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 928 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 929 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 930 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 931 | "shufps $0xdd,%%xmm1,%%xmm2 \n" |
michael@0 | 932 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 933 | "sub $0x4,%2 \n" |
michael@0 | 934 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 935 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 936 | "jg 1b \n" |
michael@0 | 937 | : "+r"(src_argb), // %0 |
michael@0 | 938 | "+r"(dst_argb), // %1 |
michael@0 | 939 | "+r"(dst_width) // %2 |
michael@0 | 940 | : |
michael@0 | 941 | : "memory", "cc" |
michael@0 | 942 | #if defined(__SSE2__) |
michael@0 | 943 | , "xmm0", "xmm1", "xmm2" |
michael@0 | 944 | #endif |
michael@0 | 945 | ); |
michael@0 | 946 | } |
michael@0 | 947 | |
michael@0 | 948 | void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, |
michael@0 | 949 | ptrdiff_t src_stride, |
michael@0 | 950 | uint8* dst_argb, int dst_width) { |
michael@0 | 951 | asm volatile ( |
michael@0 | 952 | LABELALIGN |
michael@0 | 953 | "1: \n" |
michael@0 | 954 | "movdqa " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 955 | "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" |
michael@0 | 956 | BUNDLEALIGN |
michael@0 | 957 | MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 |
michael@0 | 958 | MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 |
michael@0 | 959 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 960 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 961 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 962 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 963 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 964 | "shufps $0xdd,%%xmm1,%%xmm2 \n" |
michael@0 | 965 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 966 | "sub $0x4,%2 \n" |
michael@0 | 967 | "movdqa %%xmm0," MEMACCESS(1) " \n" |
michael@0 | 968 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 969 | "jg 1b \n" |
michael@0 | 970 | : "+r"(src_argb), // %0 |
michael@0 | 971 | "+r"(dst_argb), // %1 |
michael@0 | 972 | "+r"(dst_width) // %2 |
michael@0 | 973 | : "r"((intptr_t)(src_stride)) // %3 |
michael@0 | 974 | : "memory", "cc" |
michael@0 | 975 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 976 | , "r14" |
michael@0 | 977 | #endif |
michael@0 | 978 | #if defined(__SSE2__) |
michael@0 | 979 | , "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 980 | #endif |
michael@0 | 981 | ); |
michael@0 | 982 | } |
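
// For reference, a scalar sketch of the ARGB 2x2 box above: each of the four
// channels is averaged independently over a 2x2 block. The SSE2 path rounds in
// two pavgb stages, so results can differ slightly per channel; illustrative
// name only.
static void ScaleARGBRowDown2Box_Sketch(const uint8* src_argb,
                                        ptrdiff_t src_stride,
                                        uint8* dst_argb, int dst_width) {
  int x, c;
  for (x = 0; x < dst_width; ++x) {
    const uint8* p = src_argb + x * 8;  // Two source pixels per output pixel.
    const uint8* q = p + src_stride;    // The row below.
    for (c = 0; c < 4; ++c) {           // B, G, R, A.
      dst_argb[x * 4 + c] =
          (uint8)((p[c] + p[4 + c] + q[c] + q[4 + c] + 2) >> 2);
    }
  }
}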
michael@0 | 983 | |
michael@0 | 984 | // Reads 4 pixels at a time. |
michael@0 | 985 | // Alignment requirement: dst_argb 16 byte aligned. |
michael@0 | 986 | void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, |
michael@0 | 987 | int src_stepx, |
michael@0 | 988 | uint8* dst_argb, int dst_width) { |
michael@0 | 989 | intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
michael@0 | 990 | intptr_t src_stepx_x12 = 0; |
michael@0 | 991 | asm volatile ( |
michael@0 | 992 | "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
michael@0 | 993 | "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
michael@0 | 994 | LABELALIGN |
michael@0 | 995 | "1: \n" |
michael@0 | 996 | "movd " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 997 | MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 |
michael@0 | 998 | "punpckldq %%xmm1,%%xmm0 \n" |
michael@0 | 999 | BUNDLEALIGN |
michael@0 | 1000 | MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 |
michael@0 | 1001 | MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 |
michael@0 | 1002 | "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" |
michael@0 | 1003 | "punpckldq %%xmm3,%%xmm2 \n" |
michael@0 | 1004 | "punpcklqdq %%xmm2,%%xmm0 \n" |
michael@0 | 1005 | "sub $0x4,%3 \n" |
michael@0 | 1006 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 1007 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 1008 | "jg 1b \n" |
michael@0 | 1009 | : "+r"(src_argb), // %0 |
michael@0 | 1010 | "+r"(src_stepx_x4), // %1 |
michael@0 | 1011 | "+r"(dst_argb), // %2 |
michael@0 | 1012 | "+r"(dst_width), // %3 |
michael@0 | 1013 | "+r"(src_stepx_x12) // %4 |
michael@0 | 1014 | : |
michael@0 | 1015 | : "memory", "cc" |
michael@0 | 1016 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1017 | , "r14" |
michael@0 | 1018 | #endif |
michael@0 | 1019 | #if defined(__SSE2__) |
michael@0 | 1020 | , "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 1021 | #endif |
michael@0 | 1022 | ); |
michael@0 | 1023 | } |
michael@0 | 1024 | |
michael@0 | 1025 | // Blends four 2x2 to 4x1. |
michael@0 | 1026 | // Alignment requirement: dst_argb 16 byte aligned. |
michael@0 | 1027 | void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, |
michael@0 | 1028 | ptrdiff_t src_stride, int src_stepx, |
michael@0 | 1029 | uint8* dst_argb, int dst_width) { |
michael@0 | 1030 | intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
michael@0 | 1031 | intptr_t src_stepx_x12 = 0; |
michael@0 | 1032 | intptr_t row1 = (intptr_t)(src_stride); |
michael@0 | 1033 | asm volatile ( |
michael@0 | 1034 | "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
michael@0 | 1035 | "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
michael@0 | 1036 | "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" |
michael@0 | 1037 | |
michael@0 | 1038 | LABELALIGN |
michael@0 | 1039 | "1: \n" |
michael@0 | 1040 | "movq " MEMACCESS(0) ",%%xmm0 \n" |
michael@0 | 1041 | MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 |
michael@0 | 1042 | MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 |
michael@0 | 1043 | BUNDLEALIGN |
michael@0 | 1044 | MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 |
michael@0 | 1045 | "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" |
michael@0 | 1046 | "movq " MEMACCESS(5) ",%%xmm2 \n" |
michael@0 | 1047 | BUNDLEALIGN |
michael@0 | 1048 | MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 |
michael@0 | 1049 | MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 |
michael@0 | 1050 | MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 |
michael@0 | 1051 | "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" |
michael@0 | 1052 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 1053 | "pavgb %%xmm3,%%xmm1 \n" |
michael@0 | 1054 | "movdqa %%xmm0,%%xmm2 \n" |
michael@0 | 1055 | "shufps $0x88,%%xmm1,%%xmm0 \n" |
michael@0 | 1056 | "shufps $0xdd,%%xmm1,%%xmm2 \n" |
michael@0 | 1057 | "pavgb %%xmm2,%%xmm0 \n" |
michael@0 | 1058 | "sub $0x4,%3 \n" |
michael@0 | 1059 | "movdqa %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 1060 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 1061 | "jg 1b \n" |
michael@0 | 1062 | : "+r"(src_argb), // %0 |
michael@0 | 1063 | "+r"(src_stepx_x4), // %1 |
michael@0 | 1064 | "+r"(dst_argb), // %2 |
michael@0 | 1065 | "+rm"(dst_width), // %3 |
michael@0 | 1066 | "+r"(src_stepx_x12), // %4 |
michael@0 | 1067 | "+r"(row1) // %5 |
michael@0 | 1068 | : |
michael@0 | 1069 | : "memory", "cc" |
michael@0 | 1070 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1071 | , "r14" |
michael@0 | 1072 | #endif |
michael@0 | 1073 | #if defined(__SSE2__) |
michael@0 | 1074 | , "xmm0", "xmm1", "xmm2", "xmm3" |
michael@0 | 1075 | #endif |
michael@0 | 1076 | ); |
michael@0 | 1077 | } |
michael@0 | 1078 | |
michael@0 | 1079 | void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, |
michael@0 | 1080 | int dst_width, int x, int dx) { |
michael@0 | 1081 | intptr_t x0 = 0, x1 = 0; |
michael@0 | 1082 | asm volatile ( |
michael@0 | 1083 | "movd %5,%%xmm2 \n" |
michael@0 | 1084 | "movd %6,%%xmm3 \n" |
michael@0 | 1085 | "pshufd $0x0,%%xmm2,%%xmm2 \n" |
michael@0 | 1086 | "pshufd $0x11,%%xmm3,%%xmm0 \n" |
michael@0 | 1087 | "paddd %%xmm0,%%xmm2 \n" |
michael@0 | 1088 | "paddd %%xmm3,%%xmm3 \n" |
michael@0 | 1089 | "pshufd $0x5,%%xmm3,%%xmm0 \n" |
michael@0 | 1090 | "paddd %%xmm0,%%xmm2 \n" |
michael@0 | 1091 | "paddd %%xmm3,%%xmm3 \n" |
michael@0 | 1092 | "pshufd $0x0,%%xmm3,%%xmm3 \n" |
michael@0 | 1093 | "pextrw $0x1,%%xmm2,%k0 \n" |
michael@0 | 1094 | "pextrw $0x3,%%xmm2,%k1 \n" |
michael@0 | 1095 | "cmp $0x0,%4 \n" |
michael@0 | 1096 | "jl 99f \n" |
michael@0 | 1097 | "sub $0x4,%4 \n" |
michael@0 | 1098 | "jl 49f \n" |
michael@0 | 1099 | |
michael@0 | 1100 | LABELALIGN |
michael@0 | 1101 | "40: \n" |
michael@0 | 1102 | MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 |
michael@0 | 1103 | MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 |
michael@0 | 1104 | "pextrw $0x5,%%xmm2,%k0 \n" |
michael@0 | 1105 | "pextrw $0x7,%%xmm2,%k1 \n" |
michael@0 | 1106 | "paddd %%xmm3,%%xmm2 \n" |
michael@0 | 1107 | "punpckldq %%xmm1,%%xmm0 \n" |
michael@0 | 1108 | MEMOPREG(movd,0x00,3,0,4,xmm1) // movd (%3,%0,4),%%xmm1 |
michael@0 | 1109 | MEMOPREG(movd,0x00,3,1,4,xmm4) // movd (%3,%1,4),%%xmm4 |
michael@0 | 1110 | "pextrw $0x1,%%xmm2,%k0 \n" |
michael@0 | 1111 | "pextrw $0x3,%%xmm2,%k1 \n" |
michael@0 | 1112 | "punpckldq %%xmm4,%%xmm1 \n" |
michael@0 | 1113 | "punpcklqdq %%xmm1,%%xmm0 \n" |
michael@0 | 1114 | "sub $0x4,%4 \n" |
michael@0 | 1115 | "movdqu %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 1116 | "lea " MEMLEA(0x10,2) ",%2 \n" |
michael@0 | 1117 | "jge 40b \n" |
michael@0 | 1118 | |
michael@0 | 1119 | "49: \n" |
michael@0 | 1120 | "test $0x2,%4 \n" |
michael@0 | 1121 | "je 29f \n" |
michael@0 | 1122 | BUNDLEALIGN |
michael@0 | 1123 | MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 |
michael@0 | 1124 | MEMOPREG(movd,0x00,3,1,4,xmm1) // movd (%3,%1,4),%%xmm1 |
michael@0 | 1125 | "pextrw $0x5,%%xmm2,%k0 \n" |
michael@0 | 1126 | "punpckldq %%xmm1,%%xmm0 \n" |
michael@0 | 1127 | "movq %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 1128 | "lea " MEMLEA(0x8,2) ",%2 \n" |
michael@0 | 1129 | "29: \n" |
michael@0 | 1130 | "test $0x1,%4 \n" |
michael@0 | 1131 | "je 99f \n" |
michael@0 | 1132 | MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 |
michael@0 | 1133 | "movd %%xmm0," MEMACCESS(2) " \n" |
michael@0 | 1134 | "99: \n" |
michael@0 | 1135 | : "+a"(x0), // %0 |
michael@0 | 1136 | "+d"(x1), // %1 |
michael@0 | 1137 | "+r"(dst_argb), // %2 |
michael@0 | 1138 | "+r"(src_argb), // %3 |
michael@0 | 1139 | "+r"(dst_width) // %4 |
michael@0 | 1140 | : "rm"(x), // %5 |
michael@0 | 1141 | "rm"(dx) // %6 |
michael@0 | 1142 | : "memory", "cc" |
michael@0 | 1143 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1144 | , "r14" |
michael@0 | 1145 | #endif |
michael@0 | 1146 | #if defined(__SSE2__) |
michael@0 | 1147 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
michael@0 | 1148 | #endif |
michael@0 | 1149 | ); |
michael@0 | 1150 | } |
michael@0 | 1151 | |
michael@0 | 1152 | // Reads 4 pixels, duplicates them and writes 8 pixels. |
michael@0 | 1153 | // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
michael@0 | 1154 | void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, |
michael@0 | 1155 | int dst_width, int x, int dx) { |
michael@0 | 1156 | asm volatile ( |
michael@0 | 1157 | LABELALIGN |
michael@0 | 1158 | "1: \n" |
michael@0 | 1159 | "movdqa " MEMACCESS(1) ",%%xmm0 \n" |
michael@0 | 1160 | "lea " MEMLEA(0x10,1) ",%1 \n" |
michael@0 | 1161 | "movdqa %%xmm0,%%xmm1 \n" |
michael@0 | 1162 | "punpckldq %%xmm0,%%xmm0 \n" |
michael@0 | 1163 | "punpckhdq %%xmm1,%%xmm1 \n" |
michael@0 | 1164 | "sub $0x8,%2 \n" |
michael@0 | 1165 | "movdqa %%xmm0," MEMACCESS(0) " \n" |
michael@0 | 1166 | "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" |
michael@0 | 1167 | "lea " MEMLEA(0x20,0) ",%0 \n" |
michael@0 | 1168 | "jg 1b \n" |
michael@0 | 1169 | |
michael@0 | 1170 | : "+r"(dst_argb), // %0 |
michael@0 | 1171 | "+r"(src_argb), // %1 |
michael@0 | 1172 | "+r"(dst_width) // %2 |
michael@0 | 1173 | : |
michael@0 | 1174 | : "memory", "cc" |
michael@0 | 1175 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1176 | , "r14" |
michael@0 | 1177 | #endif |
michael@0 | 1178 | #if defined(__SSE2__) |
michael@0 | 1179 | , "xmm0", "xmm1" |
michael@0 | 1180 | #endif |
michael@0 | 1181 | ); |
michael@0 | 1182 | } |
michael@0 | 1183 | |
michael@0 | 1184 | // Shuffle table for arranging 2 pixels into pairs for pmaddubsw |
michael@0 | 1185 | static uvec8 kShuffleColARGB = { |
michael@0 | 1186 | 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel |
michael@0 | 1187 | 8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u // bbggrraa 2nd pixel |
michael@0 | 1188 | }; |
michael@0 | 1189 | |
michael@0 | 1190 | // Shuffle table for duplicating 2 fractions into 8 bytes each |
michael@0 | 1191 | static uvec8 kShuffleFractions = { |
michael@0 | 1192 | 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, |
michael@0 | 1193 | }; |
michael@0 | 1194 | |
michael@0 | 1195 | // Bilinear column filtering for ARGB: blends pairs of source pixels (4x2 bytes -> 4x1). SSSE3 version |
michael@0 | 1196 | void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, |
michael@0 | 1197 | int dst_width, int x, int dx) { |
michael@0 | 1198 | intptr_t x0 = 0, x1 = 0; |
michael@0 | 1199 | asm volatile ( |
michael@0 | 1200 | "movdqa %0,%%xmm4 \n" |
michael@0 | 1201 | "movdqa %1,%%xmm5 \n" |
michael@0 | 1202 | : |
michael@0 | 1203 | : "m"(kShuffleColARGB), // %0 |
michael@0 | 1204 | "m"(kShuffleFractions) // %1 |
michael@0 | 1205 | ); |
michael@0 | 1206 | |
michael@0 | 1207 | asm volatile ( |
michael@0 | 1208 | "movd %5,%%xmm2 \n" |
michael@0 | 1209 | "movd %6,%%xmm3 \n" |
michael@0 | 1210 | "pcmpeqb %%xmm6,%%xmm6 \n" |
michael@0 | 1211 | "psrlw $0x9,%%xmm6 \n" |
michael@0 | 1212 | "pextrw $0x1,%%xmm2,%k3 \n" |
michael@0 | 1213 | "sub $0x2,%2 \n" |
michael@0 | 1214 | "jl 29f \n" |
michael@0 | 1215 | "movdqa %%xmm2,%%xmm0 \n" |
michael@0 | 1216 | "paddd %%xmm3,%%xmm0 \n" |
michael@0 | 1217 | "punpckldq %%xmm0,%%xmm2 \n" |
michael@0 | 1218 | "punpckldq %%xmm3,%%xmm3 \n" |
michael@0 | 1219 | "paddd %%xmm3,%%xmm3 \n" |
michael@0 | 1220 | "pextrw $0x3,%%xmm2,%k4 \n" |
michael@0 | 1221 | |
michael@0 | 1222 | LABELALIGN |
michael@0 | 1223 | "2: \n" |
michael@0 | 1224 | "movdqa %%xmm2,%%xmm1 \n" |
michael@0 | 1225 | "paddd %%xmm3,%%xmm2 \n" |
michael@0 | 1226 | MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 |
michael@0 | 1227 | "psrlw $0x9,%%xmm1 \n" |
michael@0 | 1228 | BUNDLEALIGN |
michael@0 | 1229 | MEMOPREG(movhps,0x00,1,4,4,xmm0) // movhps (%1,%4,4),%%xmm0 |
michael@0 | 1230 | "pshufb %%xmm5,%%xmm1 \n" |
michael@0 | 1231 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 1232 | "pxor %%xmm6,%%xmm1 \n" |
michael@0 | 1233 | "pmaddubsw %%xmm1,%%xmm0 \n" |
michael@0 | 1234 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1235 | "pextrw $0x1,%%xmm2,%k3 \n" |
michael@0 | 1236 | "pextrw $0x3,%%xmm2,%k4 \n" |
michael@0 | 1237 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 1238 | "movq %%xmm0," MEMACCESS(0) " \n" |
michael@0 | 1239 | "lea " MEMLEA(0x8,0) ",%0 \n" |
michael@0 | 1240 | "sub $0x2,%2 \n" |
michael@0 | 1241 | "jge 2b \n" |
michael@0 | 1242 | |
michael@0 | 1243 | LABELALIGN |
michael@0 | 1244 | "29: \n" |
michael@0 | 1245 | "add $0x1,%2 \n" |
michael@0 | 1246 | "jl 99f \n" |
michael@0 | 1247 | "psrlw $0x9,%%xmm2 \n" |
michael@0 | 1248 | BUNDLEALIGN |
michael@0 | 1249 | MEMOPREG(movq,0x00,1,3,4,xmm0) // movq (%1,%3,4),%%xmm0 |
michael@0 | 1250 | "pshufb %%xmm5,%%xmm2 \n" |
michael@0 | 1251 | "pshufb %%xmm4,%%xmm0 \n" |
michael@0 | 1252 | "pxor %%xmm6,%%xmm2 \n" |
michael@0 | 1253 | "pmaddubsw %%xmm2,%%xmm0 \n" |
michael@0 | 1254 | "psrlw $0x7,%%xmm0 \n" |
michael@0 | 1255 | "packuswb %%xmm0,%%xmm0 \n" |
michael@0 | 1256 | "movd %%xmm0," MEMACCESS(0) " \n" |
michael@0 | 1257 | |
michael@0 | 1258 | LABELALIGN |
michael@0 | 1259 | "99: \n" |
michael@0 | 1260 | : "+r"(dst_argb), // %0 |
michael@0 | 1261 | "+r"(src_argb), // %1 |
michael@0 | 1262 | "+rm"(dst_width), // %2 |
michael@0 | 1263 | "+r"(x0), // %3 |
michael@0 | 1264 | "+r"(x1) // %4 |
michael@0 | 1265 | : "rm"(x), // %5 |
michael@0 | 1266 | "rm"(dx) // %6 |
michael@0 | 1267 | : "memory", "cc" |
michael@0 | 1268 | #if defined(__native_client__) && defined(__x86_64__) |
michael@0 | 1269 | , "r14" |
michael@0 | 1270 | #endif |
michael@0 | 1271 | #if defined(__SSE2__) |
michael@0 | 1272 | , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
michael@0 | 1273 | #endif |
michael@0 | 1274 | ); |
michael@0 | 1275 | } |
michael@0 | 1276 | |
michael@0 | 1277 | // Divide num by div and return as 16.16 fixed point result. |
michael@0 | 1278 | int FixedDiv_X86(int num, int div) { |
michael@0 | 1279 | asm volatile ( |
michael@0 | 1280 | "cdq \n" |
michael@0 | 1281 | "shld $0x10,%%eax,%%edx \n" |
michael@0 | 1282 | "shl $0x10,%%eax \n" |
michael@0 | 1283 | "idiv %1 \n" |
michael@0 | 1284 | "mov %0, %%eax \n" |
michael@0 | 1285 | : "+a"(num) // %0 |
michael@0 | 1286 | : "c"(div) // %1 |
michael@0 | 1287 | : "memory", "cc", "edx" |
michael@0 | 1288 | ); |
michael@0 | 1289 | return num; |
michael@0 | 1290 | } |
michael@0 | 1291 | |
michael@0 | 1292 | // Divide num - 1 by div - 1 and return as 16.16 fixed point result. |
michael@0 | 1293 | int FixedDiv1_X86(int num, int div) { |
michael@0 | 1294 | asm volatile ( |
michael@0 | 1295 | "cdq \n" |
michael@0 | 1296 | "shld $0x10,%%eax,%%edx \n" |
michael@0 | 1297 | "shl $0x10,%%eax \n" |
michael@0 | 1298 | "sub $0x10001,%%eax \n" |
michael@0 | 1299 | "sbb $0x0,%%edx \n" |
michael@0 | 1300 | "sub $0x1,%1 \n" |
michael@0 | 1301 | "idiv %1 \n" |
michael@0 | 1302 | "mov %0, %%eax \n" |
michael@0 | 1303 | : "+a"(num) // %0 |
michael@0 | 1304 | : "c"(div) // %1 |
michael@0 | 1305 | : "memory", "cc", "edx" |
michael@0 | 1306 | ); |
michael@0 | 1307 | return num; |
michael@0 | 1308 | } |
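
// For reference, plain-C equivalents of the two divides above, assuming a
// 64-bit intermediate is available (illustrative names only, not part of
// libyuv's API):
static inline int FixedDiv_Sketch(int num, int div) {
  return (int)(((int64)(num) << 16) / div);
}
static inline int FixedDiv1_Sketch(int num, int div) {
  return (int)((((int64)(num) << 16) - 0x10001) / (div - 1));
}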
michael@0 | 1309 | |
michael@0 | 1310 | #endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) |
michael@0 | 1311 | |
michael@0 | 1312 | #ifdef __cplusplus |
michael@0 | 1313 | } // extern "C" |
michael@0 | 1314 | } // namespace libyuv |
michael@0 | 1315 | #endif |