1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libyuv/source/scale_posix.cc Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1315 @@ 1.4 +/* 1.5 + * Copyright 2013 The LibYuv Project Authors. All rights reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "libyuv/row.h" 1.15 + 1.16 +#ifdef __cplusplus 1.17 +namespace libyuv { 1.18 +extern "C" { 1.19 +#endif 1.20 + 1.21 +// This module is for GCC x86 and x64. 1.22 +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) 1.23 + 1.24 +// Offsets for source bytes 0 to 9 1.25 +static uvec8 kShuf0 = 1.26 + { 0, 1, 3, 4, 5, 7, 8, 9, 128, 128, 128, 128, 128, 128, 128, 128 }; 1.27 + 1.28 +// Offsets for source bytes 11 to 20 with 8 subtracted = 3 to 12. 1.29 +static uvec8 kShuf1 = 1.30 + { 3, 4, 5, 7, 8, 9, 11, 12, 128, 128, 128, 128, 128, 128, 128, 128 }; 1.31 + 1.32 +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 1.33 +static uvec8 kShuf2 = 1.34 + { 5, 7, 8, 9, 11, 12, 13, 15, 128, 128, 128, 128, 128, 128, 128, 128 }; 1.35 + 1.36 +// Offsets for source bytes 0 to 10 1.37 +static uvec8 kShuf01 = 1.38 + { 0, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10 }; 1.39 + 1.40 +// Offsets for source bytes 10 to 21 with 8 subtracted = 3 to 13. 1.41 +static uvec8 kShuf11 = 1.42 + { 2, 3, 4, 5, 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13 }; 1.43 + 1.44 +// Offsets for source bytes 21 to 31 with 16 subtracted = 5 to 31. 
1.45 +static uvec8 kShuf21 = 1.46 + { 5, 6, 6, 7, 8, 9, 9, 10, 10, 11, 12, 13, 13, 14, 14, 15 }; 1.47 + 1.48 +// Coefficients for source bytes 0 to 10 1.49 +static uvec8 kMadd01 = 1.50 + { 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2 }; 1.51 + 1.52 +// Coefficients for source bytes 10 to 21 1.53 +static uvec8 kMadd11 = 1.54 + { 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1 }; 1.55 + 1.56 +// Coefficients for source bytes 21 to 31 1.57 +static uvec8 kMadd21 = 1.58 + { 2, 2, 1, 3, 3, 1, 2, 2, 1, 3, 3, 1, 2, 2, 1, 3 }; 1.59 + 1.60 +// Coefficients for source bytes 21 to 31 1.61 +static vec16 kRound34 = 1.62 + { 2, 2, 2, 2, 2, 2, 2, 2 }; 1.63 + 1.64 +static uvec8 kShuf38a = 1.65 + { 0, 3, 6, 8, 11, 14, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 1.66 + 1.67 +static uvec8 kShuf38b = 1.68 + { 128, 128, 128, 128, 128, 128, 0, 3, 6, 8, 11, 14, 128, 128, 128, 128 }; 1.69 + 1.70 +// Arrange words 0,3,6 into 0,1,2 1.71 +static uvec8 kShufAc = 1.72 + { 0, 1, 6, 7, 12, 13, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; 1.73 + 1.74 +// Arrange words 0,3,6 into 3,4,5 1.75 +static uvec8 kShufAc3 = 1.76 + { 128, 128, 128, 128, 128, 128, 0, 1, 6, 7, 12, 13, 128, 128, 128, 128 }; 1.77 + 1.78 +// Scaling values for boxes of 3x3 and 2x3 1.79 +static uvec16 kScaleAc33 = 1.80 + { 65536 / 9, 65536 / 9, 65536 / 6, 65536 / 9, 65536 / 9, 65536 / 6, 0, 0 }; 1.81 + 1.82 +// Arrange first value for pixels 0,1,2,3,4,5 1.83 +static uvec8 kShufAb0 = 1.84 + { 0, 128, 3, 128, 6, 128, 8, 128, 11, 128, 14, 128, 128, 128, 128, 128 }; 1.85 + 1.86 +// Arrange second value for pixels 0,1,2,3,4,5 1.87 +static uvec8 kShufAb1 = 1.88 + { 1, 128, 4, 128, 7, 128, 9, 128, 12, 128, 15, 128, 128, 128, 128, 128 }; 1.89 + 1.90 +// Arrange third value for pixels 0,1,2,3,4,5 1.91 +static uvec8 kShufAb2 = 1.92 + { 2, 128, 5, 128, 128, 128, 10, 128, 13, 128, 128, 128, 128, 128, 128, 128 }; 1.93 + 1.94 +// Scaling values for boxes of 3x2 and 2x2 1.95 +static uvec16 kScaleAb2 = 1.96 + { 65536 / 3, 65536 / 
3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; 1.97 + 1.98 +// GCC versions of row functions are verbatim conversions from Visual C. 1.99 +// Generated using gcc disassembly on Visual C object file: 1.100 +// objdump -D yuvscaler.obj >yuvscaler.txt 1.101 + 1.102 +void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.103 + uint8* dst_ptr, int dst_width) { 1.104 + asm volatile ( 1.105 + LABELALIGN 1.106 + "1: \n" 1.107 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.108 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.109 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.110 + "psrlw $0x8,%%xmm0 \n" 1.111 + "psrlw $0x8,%%xmm1 \n" 1.112 + "packuswb %%xmm1,%%xmm0 \n" 1.113 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.114 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.115 + "sub $0x10,%2 \n" 1.116 + "jg 1b \n" 1.117 + : "+r"(src_ptr), // %0 1.118 + "+r"(dst_ptr), // %1 1.119 + "+r"(dst_width) // %2 1.120 + : 1.121 + : "memory", "cc" 1.122 +#if defined(__SSE2__) 1.123 + , "xmm0", "xmm1" 1.124 +#endif 1.125 + ); 1.126 +} 1.127 + 1.128 +void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.129 + uint8* dst_ptr, int dst_width) { 1.130 + asm volatile ( 1.131 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.132 + "psrlw $0x8,%%xmm5 \n" 1.133 + 1.134 + LABELALIGN 1.135 + "1: \n" 1.136 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.137 + "movdqa " MEMACCESS2(0x10, 0) ",%%xmm1 \n" 1.138 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.139 + "movdqa %%xmm0,%%xmm2 \n" 1.140 + "psrlw $0x8,%%xmm0 \n" 1.141 + "movdqa %%xmm1,%%xmm3 \n" 1.142 + "psrlw $0x8,%%xmm1 \n" 1.143 + "pand %%xmm5,%%xmm2 \n" 1.144 + "pand %%xmm5,%%xmm3 \n" 1.145 + "pavgw %%xmm2,%%xmm0 \n" 1.146 + "pavgw %%xmm3,%%xmm1 \n" 1.147 + "packuswb %%xmm1,%%xmm0 \n" 1.148 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.149 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.150 + "sub $0x10,%2 \n" 1.151 + "jg 1b \n" 1.152 + : "+r"(src_ptr), // %0 1.153 + "+r"(dst_ptr), // %1 1.154 + "+r"(dst_width) // %2 1.155 + : 1.156 + : "memory", "cc" 1.157 +#if defined(__SSE2__) 1.158 + , 
"xmm0", "xmm1", "xmm5" 1.159 +#endif 1.160 + ); 1.161 +} 1.162 + 1.163 +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.164 + uint8* dst_ptr, int dst_width) { 1.165 + asm volatile ( 1.166 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.167 + "psrlw $0x8,%%xmm5 \n" 1.168 + 1.169 + LABELALIGN 1.170 + "1: \n" 1.171 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.172 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.173 + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 1.174 + BUNDLEALIGN 1.175 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 1.176 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.177 + "pavgb %%xmm2,%%xmm0 \n" 1.178 + "pavgb %%xmm3,%%xmm1 \n" 1.179 + "movdqa %%xmm0,%%xmm2 \n" 1.180 + "psrlw $0x8,%%xmm0 \n" 1.181 + "movdqa %%xmm1,%%xmm3 \n" 1.182 + "psrlw $0x8,%%xmm1 \n" 1.183 + "pand %%xmm5,%%xmm2 \n" 1.184 + "pand %%xmm5,%%xmm3 \n" 1.185 + "pavgw %%xmm2,%%xmm0 \n" 1.186 + "pavgw %%xmm3,%%xmm1 \n" 1.187 + "packuswb %%xmm1,%%xmm0 \n" 1.188 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.189 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.190 + "sub $0x10,%2 \n" 1.191 + "jg 1b \n" 1.192 + : "+r"(src_ptr), // %0 1.193 + "+r"(dst_ptr), // %1 1.194 + "+r"(dst_width) // %2 1.195 + : "r"((intptr_t)(src_stride)) // %3 1.196 + : "memory", "cc" 1.197 +#if defined(__native_client__) && defined(__x86_64__) 1.198 + , "r14" 1.199 +#endif 1.200 +#if defined(__SSE2__) 1.201 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.202 +#endif 1.203 + ); 1.204 +} 1.205 + 1.206 +void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.207 + uint8* dst_ptr, int dst_width) { 1.208 + asm volatile ( 1.209 + LABELALIGN 1.210 + "1: \n" 1.211 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.212 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.213 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.214 + "psrlw $0x8,%%xmm0 \n" 1.215 + "psrlw $0x8,%%xmm1 \n" 1.216 + "packuswb %%xmm1,%%xmm0 \n" 1.217 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.218 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.219 + "sub $0x10,%2 \n" 1.220 
+ "jg 1b \n" 1.221 + : "+r"(src_ptr), // %0 1.222 + "+r"(dst_ptr), // %1 1.223 + "+r"(dst_width) // %2 1.224 + : 1.225 + : "memory", "cc" 1.226 +#if defined(__SSE2__) 1.227 + , "xmm0", "xmm1" 1.228 +#endif 1.229 + ); 1.230 +} 1.231 + 1.232 +void ScaleRowDown2Linear_Unaligned_SSE2(const uint8* src_ptr, 1.233 + ptrdiff_t src_stride, 1.234 + uint8* dst_ptr, int dst_width) { 1.235 + asm volatile ( 1.236 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.237 + "psrlw $0x8,%%xmm5 \n" 1.238 + 1.239 + LABELALIGN 1.240 + "1: \n" 1.241 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.242 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.243 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.244 + "movdqa %%xmm0,%%xmm2 \n" 1.245 + "psrlw $0x8,%%xmm0 \n" 1.246 + "movdqa %%xmm1,%%xmm3 \n" 1.247 + "psrlw $0x8,%%xmm1 \n" 1.248 + "pand %%xmm5,%%xmm2 \n" 1.249 + "pand %%xmm5,%%xmm3 \n" 1.250 + "pavgw %%xmm2,%%xmm0 \n" 1.251 + "pavgw %%xmm3,%%xmm1 \n" 1.252 + "packuswb %%xmm1,%%xmm0 \n" 1.253 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.254 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.255 + "sub $0x10,%2 \n" 1.256 + "jg 1b \n" 1.257 + : "+r"(src_ptr), // %0 1.258 + "+r"(dst_ptr), // %1 1.259 + "+r"(dst_width) // %2 1.260 + : 1.261 + : "memory", "cc" 1.262 +#if defined(__SSE2__) 1.263 + , "xmm0", "xmm1", "xmm5" 1.264 +#endif 1.265 + ); 1.266 +} 1.267 + 1.268 +void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, 1.269 + ptrdiff_t src_stride, 1.270 + uint8* dst_ptr, int dst_width) { 1.271 + asm volatile ( 1.272 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.273 + "psrlw $0x8,%%xmm5 \n" 1.274 + 1.275 + LABELALIGN 1.276 + "1: \n" 1.277 + "movdqu " MEMACCESS(0) ",%%xmm0 \n" 1.278 + "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.279 + MEMOPREG(movdqu,0x00,0,3,1,xmm2) // movdqu (%0,%3,1),%%xmm2 1.280 + BUNDLEALIGN 1.281 + MEMOPREG(movdqu,0x10,0,3,1,xmm3) // movdqu 0x10(%0,%3,1),%%xmm3 1.282 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.283 + "pavgb %%xmm2,%%xmm0 \n" 1.284 + "pavgb %%xmm3,%%xmm1 \n" 1.285 + "movdqa %%xmm0,%%xmm2 \n" 1.286 + "psrlw $0x8,%%xmm0 \n" 1.287 + 
"movdqa %%xmm1,%%xmm3 \n" 1.288 + "psrlw $0x8,%%xmm1 \n" 1.289 + "pand %%xmm5,%%xmm2 \n" 1.290 + "pand %%xmm5,%%xmm3 \n" 1.291 + "pavgw %%xmm2,%%xmm0 \n" 1.292 + "pavgw %%xmm3,%%xmm1 \n" 1.293 + "packuswb %%xmm1,%%xmm0 \n" 1.294 + "movdqu %%xmm0," MEMACCESS(1) " \n" 1.295 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.296 + "sub $0x10,%2 \n" 1.297 + "jg 1b \n" 1.298 + : "+r"(src_ptr), // %0 1.299 + "+r"(dst_ptr), // %1 1.300 + "+r"(dst_width) // %2 1.301 + : "r"((intptr_t)(src_stride)) // %3 1.302 + : "memory", "cc" 1.303 +#if defined(__native_client__) && defined(__x86_64__) 1.304 + , "r14" 1.305 +#endif 1.306 +#if defined(__SSE2__) 1.307 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" 1.308 +#endif 1.309 + ); 1.310 +} 1.311 + 1.312 +void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.313 + uint8* dst_ptr, int dst_width) { 1.314 + asm volatile ( 1.315 + "pcmpeqb %%xmm5,%%xmm5 \n" 1.316 + "psrld $0x18,%%xmm5 \n" 1.317 + "pslld $0x10,%%xmm5 \n" 1.318 + 1.319 + LABELALIGN 1.320 + "1: \n" 1.321 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.322 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.323 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.324 + "pand %%xmm5,%%xmm0 \n" 1.325 + "pand %%xmm5,%%xmm1 \n" 1.326 + "packuswb %%xmm1,%%xmm0 \n" 1.327 + "psrlw $0x8,%%xmm0 \n" 1.328 + "packuswb %%xmm0,%%xmm0 \n" 1.329 + "movq %%xmm0," MEMACCESS(1) " \n" 1.330 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.331 + "sub $0x8,%2 \n" 1.332 + "jg 1b \n" 1.333 + : "+r"(src_ptr), // %0 1.334 + "+r"(dst_ptr), // %1 1.335 + "+r"(dst_width) // %2 1.336 + : 1.337 + : "memory", "cc" 1.338 +#if defined(__SSE2__) 1.339 + , "xmm0", "xmm1", "xmm5" 1.340 +#endif 1.341 + ); 1.342 +} 1.343 + 1.344 +void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.345 + uint8* dst_ptr, int dst_width) { 1.346 + intptr_t stridex3 = 0; 1.347 + asm volatile ( 1.348 + "pcmpeqb %%xmm7,%%xmm7 \n" 1.349 + "psrlw $0x8,%%xmm7 \n" 1.350 + "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" 1.351 + 1.352 + LABELALIGN 1.353 + "1: \n" 1.354 + "movdqa " 
MEMACCESS(0) ",%%xmm0 \n" 1.355 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.356 + MEMOPREG(movdqa,0x00,0,4,1,xmm2) // movdqa (%0,%4,1),%%xmm2 1.357 + BUNDLEALIGN 1.358 + MEMOPREG(movdqa,0x10,0,4,1,xmm3) // movdqa 0x10(%0,%4,1),%%xmm3 1.359 + "pavgb %%xmm2,%%xmm0 \n" 1.360 + "pavgb %%xmm3,%%xmm1 \n" 1.361 + MEMOPREG(movdqa,0x00,0,4,2,xmm2) // movdqa (%0,%4,2),%%xmm2 1.362 + BUNDLEALIGN 1.363 + MEMOPREG(movdqa,0x10,0,4,2,xmm3) // movdqa 0x10(%0,%4,2),%%xmm3 1.364 + MEMOPREG(movdqa,0x00,0,3,1,xmm4) // movdqa (%0,%3,1),%%xmm4 1.365 + MEMOPREG(movdqa,0x10,0,3,1,xmm5) // movdqa 0x10(%0,%3,1),%%xmm5 1.366 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.367 + "pavgb %%xmm4,%%xmm2 \n" 1.368 + "pavgb %%xmm2,%%xmm0 \n" 1.369 + "pavgb %%xmm5,%%xmm3 \n" 1.370 + "pavgb %%xmm3,%%xmm1 \n" 1.371 + "movdqa %%xmm0,%%xmm2 \n" 1.372 + "psrlw $0x8,%%xmm0 \n" 1.373 + "movdqa %%xmm1,%%xmm3 \n" 1.374 + "psrlw $0x8,%%xmm1 \n" 1.375 + "pand %%xmm7,%%xmm2 \n" 1.376 + "pand %%xmm7,%%xmm3 \n" 1.377 + "pavgw %%xmm2,%%xmm0 \n" 1.378 + "pavgw %%xmm3,%%xmm1 \n" 1.379 + "packuswb %%xmm1,%%xmm0 \n" 1.380 + "movdqa %%xmm0,%%xmm2 \n" 1.381 + "psrlw $0x8,%%xmm0 \n" 1.382 + "pand %%xmm7,%%xmm2 \n" 1.383 + "pavgw %%xmm2,%%xmm0 \n" 1.384 + "packuswb %%xmm0,%%xmm0 \n" 1.385 + "movq %%xmm0," MEMACCESS(1) " \n" 1.386 + "lea " MEMLEA(0x8,1) ",%1 \n" 1.387 + "sub $0x8,%2 \n" 1.388 + "jg 1b \n" 1.389 + : "+r"(src_ptr), // %0 1.390 + "+r"(dst_ptr), // %1 1.391 + "+r"(dst_width), // %2 1.392 + "+r"(stridex3) // %3 1.393 + : "r"((intptr_t)(src_stride)) // %4 1.394 + : "memory", "cc" 1.395 +#if defined(__native_client__) && defined(__x86_64__) 1.396 + , "r14" 1.397 +#endif 1.398 +#if defined(__SSE2__) 1.399 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm7" 1.400 +#endif 1.401 + ); 1.402 +} 1.403 + 1.404 +void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 1.405 + uint8* dst_ptr, int dst_width) { 1.406 + asm volatile ( 1.407 + "movdqa %0,%%xmm3 \n" 1.408 + "movdqa %1,%%xmm4 \n" 1.409 + "movdqa 
%2,%%xmm5 \n" 1.410 + : 1.411 + : "m"(kShuf0), // %0 1.412 + "m"(kShuf1), // %1 1.413 + "m"(kShuf2) // %2 1.414 + ); 1.415 + asm volatile ( 1.416 + LABELALIGN 1.417 + "1: \n" 1.418 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.419 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm2 \n" 1.420 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.421 + "movdqa %%xmm2,%%xmm1 \n" 1.422 + "palignr $0x8,%%xmm0,%%xmm1 \n" 1.423 + "pshufb %%xmm3,%%xmm0 \n" 1.424 + "pshufb %%xmm4,%%xmm1 \n" 1.425 + "pshufb %%xmm5,%%xmm2 \n" 1.426 + "movq %%xmm0," MEMACCESS(1) " \n" 1.427 + "movq %%xmm1," MEMACCESS2(0x8,1) " \n" 1.428 + "movq %%xmm2," MEMACCESS2(0x10,1) " \n" 1.429 + "lea " MEMLEA(0x18,1) ",%1 \n" 1.430 + "sub $0x18,%2 \n" 1.431 + "jg 1b \n" 1.432 + : "+r"(src_ptr), // %0 1.433 + "+r"(dst_ptr), // %1 1.434 + "+r"(dst_width) // %2 1.435 + : 1.436 + : "memory", "cc" 1.437 +#if defined(__SSE2__) 1.438 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" 1.439 +#endif 1.440 + ); 1.441 +} 1.442 + 1.443 +void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, 1.444 + ptrdiff_t src_stride, 1.445 + uint8* dst_ptr, int dst_width) { 1.446 + asm volatile ( 1.447 + "movdqa %0,%%xmm2 \n" // kShuf01 1.448 + "movdqa %1,%%xmm3 \n" // kShuf11 1.449 + "movdqa %2,%%xmm4 \n" // kShuf21 1.450 + : 1.451 + : "m"(kShuf01), // %0 1.452 + "m"(kShuf11), // %1 1.453 + "m"(kShuf21) // %2 1.454 + ); 1.455 + asm volatile ( 1.456 + "movdqa %0,%%xmm5 \n" // kMadd01 1.457 + "movdqa %1,%%xmm0 \n" // kMadd11 1.458 + "movdqa %2,%%xmm1 \n" // kRound34 1.459 + : 1.460 + : "m"(kMadd01), // %0 1.461 + "m"(kMadd11), // %1 1.462 + "m"(kRound34) // %2 1.463 + ); 1.464 + asm volatile ( 1.465 + LABELALIGN 1.466 + "1: \n" 1.467 + "movdqa " MEMACCESS(0) ",%%xmm6 \n" 1.468 + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3),%%xmm7 1.469 + "pavgb %%xmm7,%%xmm6 \n" 1.470 + "pshufb %%xmm2,%%xmm6 \n" 1.471 + "pmaddubsw %%xmm5,%%xmm6 \n" 1.472 + "paddsw %%xmm1,%%xmm6 \n" 1.473 + "psrlw $0x2,%%xmm6 \n" 1.474 + "packuswb %%xmm6,%%xmm6 \n" 1.475 + "movq %%xmm6," 
MEMACCESS(1) " \n" 1.476 + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" 1.477 + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3),%%xmm7 1.478 + "pavgb %%xmm7,%%xmm6 \n" 1.479 + "pshufb %%xmm3,%%xmm6 \n" 1.480 + "pmaddubsw %%xmm0,%%xmm6 \n" 1.481 + "paddsw %%xmm1,%%xmm6 \n" 1.482 + "psrlw $0x2,%%xmm6 \n" 1.483 + "packuswb %%xmm6,%%xmm6 \n" 1.484 + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" 1.485 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" 1.486 + BUNDLEALIGN 1.487 + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3),%%xmm7 1.488 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.489 + "pavgb %%xmm7,%%xmm6 \n" 1.490 + "pshufb %%xmm4,%%xmm6 \n" 1.491 + "pmaddubsw %4,%%xmm6 \n" 1.492 + "paddsw %%xmm1,%%xmm6 \n" 1.493 + "psrlw $0x2,%%xmm6 \n" 1.494 + "packuswb %%xmm6,%%xmm6 \n" 1.495 + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" 1.496 + "lea " MEMLEA(0x18,1) ",%1 \n" 1.497 + "sub $0x18,%2 \n" 1.498 + "jg 1b \n" 1.499 + : "+r"(src_ptr), // %0 1.500 + "+r"(dst_ptr), // %1 1.501 + "+r"(dst_width) // %2 1.502 + : "r"((intptr_t)(src_stride)), // %3 1.503 + "m"(kMadd21) // %4 1.504 + : "memory", "cc" 1.505 +#if defined(__native_client__) && defined(__x86_64__) 1.506 + , "r14" 1.507 +#endif 1.508 +#if defined(__SSE2__) 1.509 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.510 +#endif 1.511 + ); 1.512 +} 1.513 + 1.514 +void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, 1.515 + ptrdiff_t src_stride, 1.516 + uint8* dst_ptr, int dst_width) { 1.517 + asm volatile ( 1.518 + "movdqa %0,%%xmm2 \n" // kShuf01 1.519 + "movdqa %1,%%xmm3 \n" // kShuf11 1.520 + "movdqa %2,%%xmm4 \n" // kShuf21 1.521 + : 1.522 + : "m"(kShuf01), // %0 1.523 + "m"(kShuf11), // %1 1.524 + "m"(kShuf21) // %2 1.525 + ); 1.526 + asm volatile ( 1.527 + "movdqa %0,%%xmm5 \n" // kMadd01 1.528 + "movdqa %1,%%xmm0 \n" // kMadd11 1.529 + "movdqa %2,%%xmm1 \n" // kRound34 1.530 + : 1.531 + : "m"(kMadd01), // %0 1.532 + "m"(kMadd11), // %1 1.533 + "m"(kRound34) // %2 1.534 + ); 1.535 + 1.536 + asm volatile ( 1.537 + 
LABELALIGN 1.538 + "1: \n" 1.539 + "movdqa " MEMACCESS(0) ",%%xmm6 \n" 1.540 + MEMOPREG(movdqa,0x00,0,3,1,xmm7) // movdqa (%0,%3,1),%%xmm7 1.541 + "pavgb %%xmm6,%%xmm7 \n" 1.542 + "pavgb %%xmm7,%%xmm6 \n" 1.543 + "pshufb %%xmm2,%%xmm6 \n" 1.544 + "pmaddubsw %%xmm5,%%xmm6 \n" 1.545 + "paddsw %%xmm1,%%xmm6 \n" 1.546 + "psrlw $0x2,%%xmm6 \n" 1.547 + "packuswb %%xmm6,%%xmm6 \n" 1.548 + "movq %%xmm6," MEMACCESS(1) " \n" 1.549 + "movdqu " MEMACCESS2(0x8,0) ",%%xmm6 \n" 1.550 + MEMOPREG(movdqu,0x8,0,3,1,xmm7) // movdqu 0x8(%0,%3,1),%%xmm7 1.551 + "pavgb %%xmm6,%%xmm7 \n" 1.552 + "pavgb %%xmm7,%%xmm6 \n" 1.553 + "pshufb %%xmm3,%%xmm6 \n" 1.554 + "pmaddubsw %%xmm0,%%xmm6 \n" 1.555 + "paddsw %%xmm1,%%xmm6 \n" 1.556 + "psrlw $0x2,%%xmm6 \n" 1.557 + "packuswb %%xmm6,%%xmm6 \n" 1.558 + "movq %%xmm6," MEMACCESS2(0x8,1) " \n" 1.559 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm6 \n" 1.560 + MEMOPREG(movdqa,0x10,0,3,1,xmm7) // movdqa 0x10(%0,%3,1),%%xmm7 1.561 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.562 + "pavgb %%xmm6,%%xmm7 \n" 1.563 + "pavgb %%xmm7,%%xmm6 \n" 1.564 + "pshufb %%xmm4,%%xmm6 \n" 1.565 + "pmaddubsw %4,%%xmm6 \n" 1.566 + "paddsw %%xmm1,%%xmm6 \n" 1.567 + "psrlw $0x2,%%xmm6 \n" 1.568 + "packuswb %%xmm6,%%xmm6 \n" 1.569 + "movq %%xmm6," MEMACCESS2(0x10,1) " \n" 1.570 + "lea " MEMLEA(0x18,1) ",%1 \n" 1.571 + "sub $0x18,%2 \n" 1.572 + "jg 1b \n" 1.573 + : "+r"(src_ptr), // %0 1.574 + "+r"(dst_ptr), // %1 1.575 + "+r"(dst_width) // %2 1.576 + : "r"((intptr_t)(src_stride)), // %3 1.577 + "m"(kMadd21) // %4 1.578 + : "memory", "cc" 1.579 +#if defined(__native_client__) && defined(__x86_64__) 1.580 + , "r14" 1.581 +#endif 1.582 +#if defined(__SSE2__) 1.583 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.584 +#endif 1.585 + ); 1.586 +} 1.587 + 1.588 +void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, 1.589 + uint8* dst_ptr, int dst_width) { 1.590 + asm volatile ( 1.591 + "movdqa %3,%%xmm4 \n" 1.592 + "movdqa %4,%%xmm5 \n" 1.593 + 1.594 + LABELALIGN 
1.595 + "1: \n" 1.596 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.597 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.598 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.599 + "pshufb %%xmm4,%%xmm0 \n" 1.600 + "pshufb %%xmm5,%%xmm1 \n" 1.601 + "paddusb %%xmm1,%%xmm0 \n" 1.602 + "movq %%xmm0," MEMACCESS(1) " \n" 1.603 + "movhlps %%xmm0,%%xmm1 \n" 1.604 + "movd %%xmm1," MEMACCESS2(0x8,1) " \n" 1.605 + "lea " MEMLEA(0xc,1) ",%1 \n" 1.606 + "sub $0xc,%2 \n" 1.607 + "jg 1b \n" 1.608 + : "+r"(src_ptr), // %0 1.609 + "+r"(dst_ptr), // %1 1.610 + "+r"(dst_width) // %2 1.611 + : "m"(kShuf38a), // %3 1.612 + "m"(kShuf38b) // %4 1.613 + : "memory", "cc" 1.614 +#if defined(__SSE2__) 1.615 + , "xmm0", "xmm1", "xmm4", "xmm5" 1.616 +#endif 1.617 + ); 1.618 +} 1.619 + 1.620 +void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, 1.621 + ptrdiff_t src_stride, 1.622 + uint8* dst_ptr, int dst_width) { 1.623 + asm volatile ( 1.624 + "movdqa %0,%%xmm2 \n" 1.625 + "movdqa %1,%%xmm3 \n" 1.626 + "movdqa %2,%%xmm4 \n" 1.627 + "movdqa %3,%%xmm5 \n" 1.628 + : 1.629 + : "m"(kShufAb0), // %0 1.630 + "m"(kShufAb1), // %1 1.631 + "m"(kShufAb2), // %2 1.632 + "m"(kScaleAb2) // %3 1.633 + ); 1.634 + asm volatile ( 1.635 + LABELALIGN 1.636 + "1: \n" 1.637 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.638 + MEMOPREG(pavgb,0x00,0,3,1,xmm0) // pavgb (%0,%3,1),%%xmm0 1.639 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.640 + "movdqa %%xmm0,%%xmm1 \n" 1.641 + "pshufb %%xmm2,%%xmm1 \n" 1.642 + "movdqa %%xmm0,%%xmm6 \n" 1.643 + "pshufb %%xmm3,%%xmm6 \n" 1.644 + "paddusw %%xmm6,%%xmm1 \n" 1.645 + "pshufb %%xmm4,%%xmm0 \n" 1.646 + "paddusw %%xmm0,%%xmm1 \n" 1.647 + "pmulhuw %%xmm5,%%xmm1 \n" 1.648 + "packuswb %%xmm1,%%xmm1 \n" 1.649 + "sub $0x6,%2 \n" 1.650 + "movd %%xmm1," MEMACCESS(1) " \n" 1.651 + "psrlq $0x10,%%xmm1 \n" 1.652 + "movd %%xmm1," MEMACCESS2(0x2,1) " \n" 1.653 + "lea " MEMLEA(0x6,1) ",%1 \n" 1.654 + "jg 1b \n" 1.655 + : "+r"(src_ptr), // %0 1.656 + "+r"(dst_ptr), // %1 1.657 + "+r"(dst_width) // %2 1.658 + : 
"r"((intptr_t)(src_stride)) // %3 1.659 + : "memory", "cc" 1.660 +#if defined(__native_client__) && defined(__x86_64__) 1.661 + , "r14" 1.662 +#endif 1.663 +#if defined(__SSE2__) 1.664 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.665 +#endif 1.666 + ); 1.667 +} 1.668 + 1.669 +void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, 1.670 + ptrdiff_t src_stride, 1.671 + uint8* dst_ptr, int dst_width) { 1.672 + asm volatile ( 1.673 + "movdqa %0,%%xmm2 \n" 1.674 + "movdqa %1,%%xmm3 \n" 1.675 + "movdqa %2,%%xmm4 \n" 1.676 + "pxor %%xmm5,%%xmm5 \n" 1.677 + : 1.678 + : "m"(kShufAc), // %0 1.679 + "m"(kShufAc3), // %1 1.680 + "m"(kScaleAc33) // %2 1.681 + ); 1.682 + asm volatile ( 1.683 + LABELALIGN 1.684 + "1: \n" 1.685 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.686 + MEMOPREG(movdqa,0x00,0,3,1,xmm6) // movdqa (%0,%3,1),%%xmm6 1.687 + "movhlps %%xmm0,%%xmm1 \n" 1.688 + "movhlps %%xmm6,%%xmm7 \n" 1.689 + "punpcklbw %%xmm5,%%xmm0 \n" 1.690 + "punpcklbw %%xmm5,%%xmm1 \n" 1.691 + "punpcklbw %%xmm5,%%xmm6 \n" 1.692 + "punpcklbw %%xmm5,%%xmm7 \n" 1.693 + "paddusw %%xmm6,%%xmm0 \n" 1.694 + "paddusw %%xmm7,%%xmm1 \n" 1.695 + MEMOPREG(movdqa,0x00,0,3,2,xmm6) // movdqa (%0,%3,2),%%xmm6 1.696 + "lea " MEMLEA(0x10,0) ",%0 \n" 1.697 + "movhlps %%xmm6,%%xmm7 \n" 1.698 + "punpcklbw %%xmm5,%%xmm6 \n" 1.699 + "punpcklbw %%xmm5,%%xmm7 \n" 1.700 + "paddusw %%xmm6,%%xmm0 \n" 1.701 + "paddusw %%xmm7,%%xmm1 \n" 1.702 + "movdqa %%xmm0,%%xmm6 \n" 1.703 + "psrldq $0x2,%%xmm0 \n" 1.704 + "paddusw %%xmm0,%%xmm6 \n" 1.705 + "psrldq $0x2,%%xmm0 \n" 1.706 + "paddusw %%xmm0,%%xmm6 \n" 1.707 + "pshufb %%xmm2,%%xmm6 \n" 1.708 + "movdqa %%xmm1,%%xmm7 \n" 1.709 + "psrldq $0x2,%%xmm1 \n" 1.710 + "paddusw %%xmm1,%%xmm7 \n" 1.711 + "psrldq $0x2,%%xmm1 \n" 1.712 + "paddusw %%xmm1,%%xmm7 \n" 1.713 + "pshufb %%xmm3,%%xmm7 \n" 1.714 + "paddusw %%xmm7,%%xmm6 \n" 1.715 + "pmulhuw %%xmm4,%%xmm6 \n" 1.716 + "packuswb %%xmm6,%%xmm6 \n" 1.717 + "sub $0x6,%2 \n" 1.718 + "movd %%xmm6," MEMACCESS(1) " \n" 1.719 
+ "psrlq $0x10,%%xmm6 \n" 1.720 + "movd %%xmm6," MEMACCESS2(0x2,1) " \n" 1.721 + "lea " MEMLEA(0x6,1) ",%1 \n" 1.722 + "jg 1b \n" 1.723 + : "+r"(src_ptr), // %0 1.724 + "+r"(dst_ptr), // %1 1.725 + "+r"(dst_width) // %2 1.726 + : "r"((intptr_t)(src_stride)) // %3 1.727 + : "memory", "cc" 1.728 +#if defined(__native_client__) && defined(__x86_64__) 1.729 + , "r14" 1.730 +#endif 1.731 +#if defined(__SSE2__) 1.732 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" 1.733 +#endif 1.734 + ); 1.735 +} 1.736 + 1.737 +void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, 1.738 + uint16* dst_ptr, int src_width, int src_height) { 1.739 + int tmp_height = 0; 1.740 + intptr_t tmp_src = 0; 1.741 + asm volatile ( 1.742 + "pxor %%xmm4,%%xmm4 \n" 1.743 + "sub $0x1,%5 \n" 1.744 + 1.745 + LABELALIGN 1.746 + "1: \n" 1.747 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.748 + "mov %0,%3 \n" 1.749 + "add %6,%0 \n" 1.750 + "movdqa %%xmm0,%%xmm1 \n" 1.751 + "punpcklbw %%xmm4,%%xmm0 \n" 1.752 + "punpckhbw %%xmm4,%%xmm1 \n" 1.753 + "mov %5,%2 \n" 1.754 + "test %2,%2 \n" 1.755 + "je 3f \n" 1.756 + 1.757 + LABELALIGN 1.758 + "2: \n" 1.759 + "movdqa " MEMACCESS(0) ",%%xmm2 \n" 1.760 + "add %6,%0 \n" 1.761 + "movdqa %%xmm2,%%xmm3 \n" 1.762 + "punpcklbw %%xmm4,%%xmm2 \n" 1.763 + "punpckhbw %%xmm4,%%xmm3 \n" 1.764 + "paddusw %%xmm2,%%xmm0 \n" 1.765 + "paddusw %%xmm3,%%xmm1 \n" 1.766 + "sub $0x1,%2 \n" 1.767 + "jg 2b \n" 1.768 + 1.769 + LABELALIGN 1.770 + "3: \n" 1.771 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.772 + "movdqa %%xmm1," MEMACCESS2(0x10,1) " \n" 1.773 + "lea " MEMLEA(0x10,3) ",%0 \n" 1.774 + "lea " MEMLEA(0x20,1) ",%1 \n" 1.775 + "sub $0x10,%4 \n" 1.776 + "jg 1b \n" 1.777 + : "+r"(src_ptr), // %0 1.778 + "+r"(dst_ptr), // %1 1.779 + "+r"(tmp_height), // %2 1.780 + "+r"(tmp_src), // %3 1.781 + "+r"(src_width), // %4 1.782 + "+rm"(src_height) // %5 1.783 + : "rm"((intptr_t)(src_stride)) // %6 1.784 + : "memory", "cc" 1.785 +#if defined(__SSE2__) 1.786 + , "xmm0", 
"xmm1", "xmm2", "xmm3", "xmm4" 1.787 +#endif 1.788 + ); 1.789 +} 1.790 + 1.791 +// Bilinear column filtering. SSSE3 version. 1.792 +void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 1.793 + int dst_width, int x, int dx) { 1.794 + intptr_t x0 = 0, x1 = 0, temp_pixel = 0; 1.795 + asm volatile ( 1.796 + "movd %6,%%xmm2 \n" 1.797 + "movd %7,%%xmm3 \n" 1.798 + "movl $0x04040000,%k2 \n" 1.799 + "movd %k2,%%xmm5 \n" 1.800 + "pcmpeqb %%xmm6,%%xmm6 \n" 1.801 + "psrlw $0x9,%%xmm6 \n" 1.802 + "pextrw $0x1,%%xmm2,%k3 \n" 1.803 + "subl $0x2,%5 \n" 1.804 + "jl 29f \n" 1.805 + "movdqa %%xmm2,%%xmm0 \n" 1.806 + "paddd %%xmm3,%%xmm0 \n" 1.807 + "punpckldq %%xmm0,%%xmm2 \n" 1.808 + "punpckldq %%xmm3,%%xmm3 \n" 1.809 + "paddd %%xmm3,%%xmm3 \n" 1.810 + "pextrw $0x3,%%xmm2,%k4 \n" 1.811 + 1.812 + LABELALIGN 1.813 + "2: \n" 1.814 + "movdqa %%xmm2,%%xmm1 \n" 1.815 + "paddd %%xmm3,%%xmm2 \n" 1.816 + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 1.817 + "movd %k2,%%xmm0 \n" 1.818 + "psrlw $0x9,%%xmm1 \n" 1.819 + BUNDLEALIGN 1.820 + MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 1.821 + "movd %k2,%%xmm4 \n" 1.822 + "pshufb %%xmm5,%%xmm1 \n" 1.823 + "punpcklwd %%xmm4,%%xmm0 \n" 1.824 + "pxor %%xmm6,%%xmm1 \n" 1.825 + "pmaddubsw %%xmm1,%%xmm0 \n" 1.826 + "pextrw $0x1,%%xmm2,%k3 \n" 1.827 + "pextrw $0x3,%%xmm2,%k4 \n" 1.828 + "psrlw $0x7,%%xmm0 \n" 1.829 + "packuswb %%xmm0,%%xmm0 \n" 1.830 + "movd %%xmm0,%k2 \n" 1.831 + "mov %w2," MEMACCESS(0) " \n" 1.832 + "lea " MEMLEA(0x2,0) ",%0 \n" 1.833 + "sub $0x2,%5 \n" 1.834 + "jge 2b \n" 1.835 + 1.836 + LABELALIGN 1.837 + "29: \n" 1.838 + "addl $0x1,%5 \n" 1.839 + "jl 99f \n" 1.840 + MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 1.841 + "movd %k2,%%xmm0 \n" 1.842 + "psrlw $0x9,%%xmm2 \n" 1.843 + "pshufb %%xmm5,%%xmm2 \n" 1.844 + "pxor %%xmm6,%%xmm2 \n" 1.845 + "pmaddubsw %%xmm2,%%xmm0 \n" 1.846 + "psrlw $0x7,%%xmm0 \n" 1.847 + "packuswb %%xmm0,%%xmm0 \n" 1.848 + "movd %%xmm0,%k2 \n" 1.849 + "mov %b2," 
MEMACCESS(0) " \n" 1.850 + "99: \n" 1.851 + : "+r"(dst_ptr), // %0 1.852 + "+r"(src_ptr), // %1 1.853 + "+a"(temp_pixel), // %2 1.854 + "+r"(x0), // %3 1.855 + "+r"(x1), // %4 1.856 + "+rm"(dst_width) // %5 1.857 + : "rm"(x), // %6 1.858 + "rm"(dx) // %7 1.859 + : "memory", "cc" 1.860 +#if defined(__native_client__) && defined(__x86_64__) 1.861 + , "r14" 1.862 +#endif 1.863 +#if defined(__SSE2__) 1.864 + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" 1.865 +#endif 1.866 + ); 1.867 +} 1.868 + 1.869 +// Reads 4 pixels, duplicates them and writes 8 pixels. 1.870 +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. 1.871 +void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, 1.872 + int dst_width, int x, int dx) { 1.873 + asm volatile ( 1.874 + LABELALIGN 1.875 + "1: \n" 1.876 + "movdqa " MEMACCESS(1) ",%%xmm0 \n" 1.877 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.878 + "movdqa %%xmm0,%%xmm1 \n" 1.879 + "punpcklbw %%xmm0,%%xmm0 \n" 1.880 + "punpckhbw %%xmm1,%%xmm1 \n" 1.881 + "sub $0x20,%2 \n" 1.882 + "movdqa %%xmm0," MEMACCESS(0) " \n" 1.883 + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" 1.884 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.885 + "jg 1b \n" 1.886 + 1.887 + : "+r"(dst_ptr), // %0 1.888 + "+r"(src_ptr), // %1 1.889 + "+r"(dst_width) // %2 1.890 + : 1.891 + : "memory", "cc" 1.892 +#if defined(__SSE2__) 1.893 + , "xmm0", "xmm1" 1.894 +#endif 1.895 + ); 1.896 +} 1.897 + 1.898 +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, 1.899 + ptrdiff_t src_stride, 1.900 + uint8* dst_argb, int dst_width) { 1.901 + asm volatile ( 1.902 + LABELALIGN 1.903 + "1: \n" 1.904 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.905 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.906 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.907 + "shufps $0xdd,%%xmm1,%%xmm0 \n" 1.908 + "sub $0x4,%2 \n" 1.909 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.910 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.911 + "jg 1b \n" 1.912 + : "+r"(src_argb), // %0 1.913 + "+r"(dst_argb), // %1 1.914 + "+r"(dst_width) // %2 
1.915 + : 1.916 + : "memory", "cc" 1.917 +#if defined(__SSE2__) 1.918 + , "xmm0", "xmm1" 1.919 +#endif 1.920 + ); 1.921 +} 1.922 + 1.923 +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, 1.924 + ptrdiff_t src_stride, 1.925 + uint8* dst_argb, int dst_width) { 1.926 + asm volatile ( 1.927 + LABELALIGN 1.928 + "1: \n" 1.929 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.930 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.931 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.932 + "movdqa %%xmm0,%%xmm2 \n" 1.933 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.934 + "shufps $0xdd,%%xmm1,%%xmm2 \n" 1.935 + "pavgb %%xmm2,%%xmm0 \n" 1.936 + "sub $0x4,%2 \n" 1.937 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.938 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.939 + "jg 1b \n" 1.940 + : "+r"(src_argb), // %0 1.941 + "+r"(dst_argb), // %1 1.942 + "+r"(dst_width) // %2 1.943 + : 1.944 + : "memory", "cc" 1.945 +#if defined(__SSE2__) 1.946 + , "xmm0", "xmm1" 1.947 +#endif 1.948 + ); 1.949 +} 1.950 + 1.951 +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, 1.952 + ptrdiff_t src_stride, 1.953 + uint8* dst_argb, int dst_width) { 1.954 + asm volatile ( 1.955 + LABELALIGN 1.956 + "1: \n" 1.957 + "movdqa " MEMACCESS(0) ",%%xmm0 \n" 1.958 + "movdqa " MEMACCESS2(0x10,0) ",%%xmm1 \n" 1.959 + BUNDLEALIGN 1.960 + MEMOPREG(movdqa,0x00,0,3,1,xmm2) // movdqa (%0,%3,1),%%xmm2 1.961 + MEMOPREG(movdqa,0x10,0,3,1,xmm3) // movdqa 0x10(%0,%3,1),%%xmm3 1.962 + "lea " MEMLEA(0x20,0) ",%0 \n" 1.963 + "pavgb %%xmm2,%%xmm0 \n" 1.964 + "pavgb %%xmm3,%%xmm1 \n" 1.965 + "movdqa %%xmm0,%%xmm2 \n" 1.966 + "shufps $0x88,%%xmm1,%%xmm0 \n" 1.967 + "shufps $0xdd,%%xmm1,%%xmm2 \n" 1.968 + "pavgb %%xmm2,%%xmm0 \n" 1.969 + "sub $0x4,%2 \n" 1.970 + "movdqa %%xmm0," MEMACCESS(1) " \n" 1.971 + "lea " MEMLEA(0x10,1) ",%1 \n" 1.972 + "jg 1b \n" 1.973 + : "+r"(src_argb), // %0 1.974 + "+r"(dst_argb), // %1 1.975 + "+r"(dst_width) // %2 1.976 + : "r"((intptr_t)(src_stride)) // %3 1.977 + : "memory", "cc" 1.978 +#if defined(__native_client__) && 
defined(__x86_64__) 1.979 + , "r14" 1.980 +#endif 1.981 +#if defined(__SSE2__) 1.982 + , "xmm0", "xmm1", "xmm2", "xmm3" 1.983 +#endif 1.984 + ); 1.985 +} 1.986 + 1.987 +// Reads 4 pixels at a time. 1.988 +// Alignment requirement: dst_argb 16 byte aligned. 1.989 +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, 1.990 + int src_stepx, 1.991 + uint8* dst_argb, int dst_width) { 1.992 + intptr_t src_stepx_x4 = (intptr_t)(src_stepx); 1.993 + intptr_t src_stepx_x12 = 0; 1.994 + asm volatile ( 1.995 + "lea " MEMLEA3(0x00,1,4) ",%1 \n" 1.996 + "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" 1.997 + LABELALIGN 1.998 + "1: \n" 1.999 + "movd " MEMACCESS(0) ",%%xmm0 \n" 1.1000 + MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 1.1001 + "punpckldq %%xmm1,%%xmm0 \n" 1.1002 + BUNDLEALIGN 1.1003 + MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 1.1004 + MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 1.1005 + "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" 1.1006 + "punpckldq %%xmm3,%%xmm2 \n" 1.1007 + "punpcklqdq %%xmm2,%%xmm0 \n" 1.1008 + "sub $0x4,%3 \n" 1.1009 + "movdqa %%xmm0," MEMACCESS(2) " \n" 1.1010 + "lea " MEMLEA(0x10,2) ",%2 \n" 1.1011 + "jg 1b \n" 1.1012 + : "+r"(src_argb), // %0 1.1013 + "+r"(src_stepx_x4), // %1 1.1014 + "+r"(dst_argb), // %2 1.1015 + "+r"(dst_width), // %3 1.1016 + "+r"(src_stepx_x12) // %4 1.1017 + : 1.1018 + : "memory", "cc" 1.1019 +#if defined(__native_client__) && defined(__x86_64__) 1.1020 + , "r14" 1.1021 +#endif 1.1022 +#if defined(__SSE2__) 1.1023 + , "xmm0", "xmm1", "xmm2", "xmm3" 1.1024 +#endif 1.1025 + ); 1.1026 +} 1.1027 + 1.1028 +// Blends four 2x2 to 4x1. 1.1029 +// Alignment requirement: dst_argb 16 byte aligned. 
// Box-filter variant of ScaleARGBRowDownEven: for each output pixel it
// averages a 2x2 block (two horizontally adjacent source pixels on two
// consecutive rows) with pavgb.  %5 (row1) starts as src_stride and is
// rebased to the second-row cursor; %1/%4 are stepx*4 / stepx*12 byte
// offsets as in the non-box variant.  Note pavgb rounds up, so the result
// of the three-average cascade is a rounded, not truncated, mean.
void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                  ptrdiff_t src_stride, int src_stepx,
                                  uint8* dst_argb, int dst_width) {
  intptr_t src_stepx_x4 = (intptr_t)(src_stepx);   // becomes stepx*4 below
  intptr_t src_stepx_x12 = 0;                      // becomes stepx*12 below
  intptr_t row1 = (intptr_t)(src_stride);          // becomes src + stride below
  asm volatile (
    "lea " MEMLEA3(0x00,1,4) ",%1 \n"    // %1 = stepx*4
    "lea " MEMLEA4(0x00,1,1,2) ",%4 \n"  // %4 = stepx*12
    "lea " MEMLEA4(0x00,0,5,1) ",%5 \n"  // %5 = src_argb + src_stride

    LABELALIGN
  "1: \n"
    // Row 0: load pixel pairs at steps 0/1 into xmm0 and 2/3 into xmm1.
    "movq " MEMACCESS(0) ",%%xmm0 \n"
    MEMOPREG(movhps,0x00,0,1,1,xmm0)  //  movhps    (%0,%1,1),%%xmm0
    MEMOPREG(movq,0x00,0,1,2,xmm1)    //  movq      (%0,%1,2),%%xmm1
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,0,4,1,xmm1)  //  movhps    (%0,%4,1),%%xmm1
    "lea " MEMLEA4(0x00,0,1,4) ",%0 \n"  // advance row0 cursor by 4 steps
    // Row 1: same four pixel pairs into xmm2/xmm3.
    "movq " MEMACCESS(5) ",%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,5,1,1,xmm2)  //  movhps    (%5,%1,1),%%xmm2
    MEMOPREG(movq,0x00,5,1,2,xmm3)    //  movq      (%5,%1,2),%%xmm3
    MEMOPREG(movhps,0x00,5,4,1,xmm3)  //  movhps    (%5,%4,1),%%xmm3
    "lea " MEMLEA4(0x00,5,1,4) ",%5 \n"  // advance row1 cursor by 4 steps
    // Vertical average (row0, row1) ...
    "pavgb %%xmm2,%%xmm0 \n"
    "pavgb %%xmm3,%%xmm1 \n"
    // ... then pair the even/odd pixels of each 2x2 block and average
    // horizontally: shufps $0x88 keeps even dwords, $0xdd the odd ones.
    "movdqa %%xmm0,%%xmm2 \n"
    "shufps $0x88,%%xmm1,%%xmm0 \n"
    "shufps $0xdd,%%xmm1,%%xmm2 \n"
    "pavgb %%xmm2,%%xmm0 \n"
    "sub $0x4,%3 \n"
    "movdqa %%xmm0," MEMACCESS(2) " \n"    // aligned 16-byte store
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jg 1b \n"
  : "+r"(src_argb),       // %0
    "+r"(src_stepx_x4),   // %1
    "+r"(dst_argb),       // %2
    "+rm"(dst_width),     // %3
    "+r"(src_stepx_x12),  // %4
    "+r"(row1)            // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// Point-sampling horizontal ARGB scaler using 16.16 fixed-point stepping.
// dst_argb[i] = src_argb[(x + i*dx) >> 16] for i in [0, dst_width).
// xmm2 holds four consecutive fixed-point x values, xmm3 holds 4*dx; the
// pixel index lives in the high word of each 32-bit lane, which is why it
// is pulled out with pextrw $1/$3/$5/$7 (== lane >> 16).  x0/x1 are pinned
// to eax/edx via "+a"/"+d" so %k0/%k1 name their 32-bit forms on x86-64.
// Handles dst_width in groups of 4, then a tail of 2 and/or 1.
// NOTE(review): the "cmp $0x0,%4 / jl 99f" guard makes a negative
// dst_width a no-op — presumably defensive; confirm against callers.
void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb,
                        int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;   // pixel indices extracted from xmm2
  asm volatile (
    // Build xmm2 = {x, x+dx, x+2dx, x+3dx} and xmm3 = {4dx, 4dx, 4dx, 4dx}.
    "movd %5,%%xmm2 \n"
    "movd %6,%%xmm3 \n"
    "pshufd $0x0,%%xmm2,%%xmm2 \n"
    "pshufd $0x11,%%xmm3,%%xmm0 \n"   // {0, dx, 0, dx}
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"          // dx *= 2
    "pshufd $0x5,%%xmm3,%%xmm0 \n"    // {0, 0, 2dx, 2dx}
    "paddd %%xmm0,%%xmm2 \n"
    "paddd %%xmm3,%%xmm3 \n"          // dx *= 2 again -> 4dx
    "pshufd $0x0,%%xmm3,%%xmm3 \n"
    "pextrw $0x1,%%xmm2,%k0 \n"       // index 0 = lane0 >> 16
    "pextrw $0x3,%%xmm2,%k1 \n"       // index 1 = lane1 >> 16
    "cmp $0x0,%4 \n"
    "jl 99f \n"
    "sub $0x4,%4 \n"
    "jl 49f \n"

    LABELALIGN
  "40: \n"
    // Fetch pixels 0/1, extract indices 2/3, advance xmm2 by 4dx.
    MEMOPREG(movd,0x00,3,0,4,xmm0)  //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  //  movd      (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "pextrw $0x7,%%xmm2,%k1 \n"
    "paddd %%xmm3,%%xmm2 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    // Fetch pixels 2/3 and pre-extract indices for the next iteration.
    MEMOPREG(movd,0x00,3,0,4,xmm1)  //  movd      (%3,%0,4),%%xmm1
    MEMOPREG(movd,0x00,3,1,4,xmm4)  //  movd      (%3,%1,4),%%xmm4
    "pextrw $0x1,%%xmm2,%k0 \n"
    "pextrw $0x3,%%xmm2,%k1 \n"
    "punpckldq %%xmm4,%%xmm1 \n"
    "punpcklqdq %%xmm1,%%xmm0 \n"
    "sub $0x4,%4 \n"
    "movdqu %%xmm0," MEMACCESS(2) " \n"   // unaligned store: dst need not be aligned
    "lea " MEMLEA(0x10,2) ",%2 \n"
    "jge 40b \n"

  "49: \n"
    // Tail: 2 remaining pixels?
    "test $0x2,%4 \n"
    "je 29f \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,3,0,4,xmm0)  //  movd      (%3,%0,4),%%xmm0
    MEMOPREG(movd,0x00,3,1,4,xmm1)  //  movd      (%3,%1,4),%%xmm1
    "pextrw $0x5,%%xmm2,%k0 \n"
    "punpckldq %%xmm1,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(2) " \n"
    "lea " MEMLEA(0x8,2) ",%2 \n"
  "29: \n"
    // Tail: 1 remaining pixel?
    "test $0x1,%4 \n"
    "je 99f \n"
    MEMOPREG(movd,0x00,3,0,4,xmm0)  //  movd      (%3,%0,4),%%xmm0
    "movd %%xmm0," MEMACCESS(2) " \n"
  "99: \n"
  : "+a"(x0),        // %0
    "+d"(x1),        // %1
    "+r"(dst_argb),  // %2
    "+r"(src_argb),  // %3
    "+r"(dst_width)  // %4
  : "rm"(x),         // %5
    "rm"(dx)         // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}

// Reads 4 pixels, duplicates them and writes 8 pixels.
// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned.
//
// 2x horizontal upscale: punpckldq/punpckhdq double each 32-bit pixel in
// place (abcd -> aabb / ccdd).  x and dx are ignored — callers use this
// fast path only for the exact 2x case.
void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
                           int dst_width, int x, int dx) {
  asm volatile (
    LABELALIGN
  "1: \n"
    "movdqa " MEMACCESS(1) ",%%xmm0 \n"
    "lea " MEMLEA(0x10,1) ",%1 \n"
    "movdqa %%xmm0,%%xmm1 \n"
    "punpckldq %%xmm0,%%xmm0 \n"   // low two pixels doubled
    "punpckhdq %%xmm1,%%xmm1 \n"   // high two pixels doubled
    "sub $0x8,%2 \n"
    "movdqa %%xmm0," MEMACCESS(0) " \n"
    "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n"
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "jg 1b \n"

  : "+r"(dst_argb),  // %0
    "+r"(src_argb),  // %1
    "+r"(dst_width)  // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}

// Shuffle table for arranging 2 pixels into pairs for pmaddubsw
static uvec8 kShuffleColARGB = {
  0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u,  // bbggrraa 1st pixel
  8u, 12u, 9u, 13u, 10u, 14u, 11u, 15u  // bbggrraa 2nd pixel
};

// Shuffle table for duplicating 2 fractions into 8 bytes each
static uvec8 kShuffleFractions = {
  0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u,
};

// Bilinear row filtering combines 4x2 -> 4x1.
SSSE3 version
// Bilinear horizontal ARGB scaler: for each destination pixel, blends the
// two source pixels straddling the 16.16 fixed-point position x + i*dx.
// The first asm block preloads the shuffle constants into xmm4/xmm5 so the
// second block is free to use all operand slots; xmm6 is a 0x7f7f... mask
// (pcmpeqb then psrlw $9) used to form "128 - frac" via pxor for the
// 7-bit pmaddubsw blend.  Processes 2 pixels per iteration plus a 1-pixel
// tail.  x0/x1 receive the integer pixel indices extracted from xmm2.
void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
                               int dst_width, int x, int dx) {
  intptr_t x0 = 0, x1 = 0;
  asm volatile (
    // Preload constants: xmm4 = pixel interleave, xmm5 = fraction splat.
    "movdqa %0,%%xmm4 \n"
    "movdqa %1,%%xmm5 \n"
  :
  : "m"(kShuffleColARGB),   // %0
    "m"(kShuffleFractions)  // %1
  );

  asm volatile (
    "movd %5,%%xmm2 \n"                // xmm2 = x
    "movd %6,%%xmm3 \n"                // xmm3 = dx
    "pcmpeqb %%xmm6,%%xmm6 \n"
    "psrlw $0x9,%%xmm6 \n"             // xmm6 = 0x007f per word
    "pextrw $0x1,%%xmm2,%k3 \n"        // index 0 = x >> 16
    "sub $0x2,%2 \n"
    "jl 29f \n"
    // xmm2 = {x, x+dx}, xmm3 = {2dx, 2dx} for the 2-at-a-time loop.
    "movdqa %%xmm2,%%xmm0 \n"
    "paddd %%xmm3,%%xmm0 \n"
    "punpckldq %%xmm0,%%xmm2 \n"
    "punpckldq %%xmm3,%%xmm3 \n"
    "paddd %%xmm3,%%xmm3 \n"
    "pextrw $0x3,%%xmm2,%k4 \n"        // index 1 = (x+dx) >> 16

    LABELALIGN
  "2: \n"
    "movdqa %%xmm2,%%xmm1 \n"
    "paddd %%xmm3,%%xmm2 \n"           // step both x values by 2dx
    MEMOPREG(movq,0x00,1,3,4,xmm0)     //  movq      (%1,%3,4),%%xmm0  pixel pair 0
    "psrlw $0x9,%%xmm1 \n"             // 7-bit fractions
    BUNDLEALIGN
    MEMOPREG(movhps,0x00,1,4,4,xmm0)   //  movhps    (%1,%4,4),%%xmm0  pixel pair 1
    "pshufb %%xmm5,%%xmm1 \n"          // splat each fraction over 8 bytes
    "pshufb %%xmm4,%%xmm0 \n"          // interleave pixel pairs per channel
    "pxor %%xmm6,%%xmm1 \n"            // frac -> {128-frac, frac} pattern
    "pmaddubsw %%xmm1,%%xmm0 \n"       // weighted sum per channel
    "psrlw $0x7,%%xmm0 \n"             // /128
    "pextrw $0x1,%%xmm2,%k3 \n"        // next indices
    "pextrw $0x3,%%xmm2,%k4 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movq %%xmm0," MEMACCESS(0) " \n"
    "lea " MEMLEA(0x8,0) ",%0 \n"
    "sub $0x2,%2 \n"
    "jge 2b \n"

    LABELALIGN
  "29: \n"
    // Tail: one final pixel if dst_width was odd.
    "add $0x1,%2 \n"
    "jl 99f \n"
    "psrlw $0x9,%%xmm2 \n"
    BUNDLEALIGN
    MEMOPREG(movq,0x00,1,3,4,xmm0)     //  movq      (%1,%3,4),%%xmm0
    "pshufb %%xmm5,%%xmm2 \n"
    "pshufb %%xmm4,%%xmm0 \n"
    "pxor %%xmm6,%%xmm2 \n"
    "pmaddubsw %%xmm2,%%xmm0 \n"
    "psrlw $0x7,%%xmm0 \n"
    "packuswb %%xmm0,%%xmm0 \n"
    "movd %%xmm0," MEMACCESS(0) " \n"

    LABELALIGN
  "99: \n"
  : "+r"(dst_argb),    // %0
    "+r"(src_argb),    // %1
    "+rm"(dst_width),  // %2
    "+r"(x0),          // %3
    "+r"(x1)           // %4
  : "rm"(x),           // %5
    "rm"(dx)           // %6
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}

// Divide num by div and return as 16.16 fixed point result.
// cdq sign-extends eax into edx, then shld/shl form the 48-bit value
// num << 16 in edx:eax for the 64/32 idiv.  "+a"(num) pins num to eax,
// so the trailing "mov %0, %%eax" is mov eax,eax — effectively a no-op
// kept for symmetry with other ports; the quotient is already in eax/num.
// NOTE(review): no guard against div == 0 or quotient overflow — callers
// are expected to pass sane values (idiv faults otherwise).
int FixedDiv_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}

// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
// Same scheme as FixedDiv_X86, but computes ((num << 16) - 0x10001) /
// (div - 1): the 64-bit constant subtraction (sub/sbb across eax:edx)
// takes 1.0 off both the integer and fractional scale before dividing,
// matching the inclusive-endpoint interpolation used by the scalers.
// NOTE(review): div == 1 would divide by zero — callers must prevent it.
int FixedDiv1_X86(int num, int div) {
  asm volatile (
    "cdq \n"
    "shld $0x10,%%eax,%%edx \n"
    "shl $0x10,%%eax \n"
    "sub $0x10001,%%eax \n"
    "sbb $0x0,%%edx \n"
    "sub $0x1,%1 \n"
    "idiv %1 \n"
    "mov %0, %%eax \n"
  : "+a"(num)  // %0
  : "c"(div)   // %1
  : "memory", "cc", "edx"
  );
  return num;
}

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif