1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp9/common/x86/vp9_asm_stubs.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,468 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include <assert.h> 1.15 + 1.16 +#include "./vpx_config.h" 1.17 +#include "./vp9_rtcd.h" 1.18 +#include "vpx_ports/mem.h" 1.19 +/////////////////////////////////////////////////////////////////////////// 1.20 +// the mmx function that does the bilinear filtering and var calculation // 1.21 +// int one pass // 1.22 +/////////////////////////////////////////////////////////////////////////// 1.23 +DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { 1.24 + { 128, 128, 128, 128, 0, 0, 0, 0 }, 1.25 + { 120, 120, 120, 120, 8, 8, 8, 8 }, 1.26 + { 112, 112, 112, 112, 16, 16, 16, 16 }, 1.27 + { 104, 104, 104, 104, 24, 24, 24, 24 }, 1.28 + { 96, 96, 96, 96, 32, 32, 32, 32 }, 1.29 + { 88, 88, 88, 88, 40, 40, 40, 40 }, 1.30 + { 80, 80, 80, 80, 48, 48, 48, 48 }, 1.31 + { 72, 72, 72, 72, 56, 56, 56, 56 }, 1.32 + { 64, 64, 64, 64, 64, 64, 64, 64 }, 1.33 + { 56, 56, 56, 56, 72, 72, 72, 72 }, 1.34 + { 48, 48, 48, 48, 80, 80, 80, 80 }, 1.35 + { 40, 40, 40, 40, 88, 88, 88, 88 }, 1.36 + { 32, 32, 32, 32, 96, 96, 96, 96 }, 1.37 + { 24, 24, 24, 24, 104, 104, 104, 104 }, 1.38 + { 16, 16, 16, 16, 112, 112, 112, 112 }, 1.39 + { 8, 8, 8, 8, 120, 120, 120, 120 } 1.40 +}; 1.41 + 1.42 +typedef void filter8_1dfunction ( 1.43 + const unsigned char *src_ptr, 1.44 + const unsigned int src_pitch, 1.45 + unsigned char *output_ptr, 1.46 + unsigned int out_pitch, 1.47 + unsigned int output_height, 1.48 + const short *filter 1.49 +); 1.50 + 1.51 +#if HAVE_SSSE3 1.52 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; 1.53 +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; 1.54 +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; 1.55 +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; 1.56 +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; 1.57 +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; 1.58 +filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; 1.59 +filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; 1.60 +filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; 1.61 +filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; 1.62 +filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; 1.63 +filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; 1.64 + 1.65 +void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1.66 + uint8_t *dst, ptrdiff_t dst_stride, 1.67 + const int16_t *filter_x, int x_step_q4, 1.68 + const int16_t *filter_y, int y_step_q4, 1.69 + int w, int h) { 1.70 + /* Ensure the filter can be compressed to int16_t. */ 1.71 + if (x_step_q4 == 16 && filter_x[3] != 128) { 1.72 + while (w >= 16) { 1.73 + vp9_filter_block1d16_h8_ssse3(src, src_stride, 1.74 + dst, dst_stride, 1.75 + h, filter_x); 1.76 + src += 16; 1.77 + dst += 16; 1.78 + w -= 16; 1.79 + } 1.80 + while (w >= 8) { 1.81 + vp9_filter_block1d8_h8_ssse3(src, src_stride, 1.82 + dst, dst_stride, 1.83 + h, filter_x); 1.84 + src += 8; 1.85 + dst += 8; 1.86 + w -= 8; 1.87 + } 1.88 + while (w >= 4) { 1.89 + vp9_filter_block1d4_h8_ssse3(src, src_stride, 1.90 + dst, dst_stride, 1.91 + h, filter_x); 1.92 + src += 4; 1.93 + dst += 4; 1.94 + w -= 4; 1.95 + } 1.96 + } 1.97 + if (w) { 1.98 + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, 1.99 + filter_x, x_step_q4, filter_y, y_step_q4, 1.100 + w, h); 1.101 + } 1.102 +} 1.103 + 1.104 +void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1.105 + uint8_t *dst, ptrdiff_t dst_stride, 1.106 + const int16_t *filter_x, int x_step_q4, 1.107 + const int16_t *filter_y, int y_step_q4, 1.108 + int w, int h) { 1.109 + if (y_step_q4 == 16 && filter_y[3] != 128) { 1.110 + while (w >= 16) { 1.111 + vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, 1.112 + dst, dst_stride, 1.113 + h, filter_y); 1.114 + src += 16; 1.115 + dst += 16; 1.116 + w -= 16; 1.117 + } 1.118 + while (w >= 8) { 1.119 + vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, 1.120 + dst, dst_stride, 1.121 + h, filter_y); 1.122 + src += 8; 1.123 + dst += 8; 1.124 + w -= 8; 1.125 + } 1.126 + while (w >= 4) { 1.127 + vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, 1.128 + dst, dst_stride, 1.129 + h, filter_y); 1.130 + src += 4; 1.131 + dst += 4; 1.132 + w -= 4; 1.133 + } 1.134 + } 1.135 + if (w) { 1.136 + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, 1.137 + filter_x, x_step_q4, filter_y, y_step_q4, 1.138 + w, h); 1.139 + } 1.140 +} 1.141 + 1.142 +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1.143 + uint8_t *dst, ptrdiff_t dst_stride, 1.144 + const int16_t *filter_x, int x_step_q4, 1.145 + const int16_t *filter_y, int y_step_q4, 1.146 + int w, int h) { 1.147 + if (x_step_q4 == 16 && filter_x[3] != 128) { 1.148 + while (w >= 16) { 1.149 + vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, 1.150 + dst, dst_stride, 1.151 + h, filter_x); 1.152 + src += 16; 1.153 + dst += 16; 1.154 + w -= 16; 1.155 + } 1.156 + while (w >= 8) { 1.157 + vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, 1.158 + dst, dst_stride, 1.159 + h, filter_x); 1.160 + src += 8; 1.161 + dst += 8; 1.162 + w -= 8; 1.163 + } 1.164 + while (w >= 4) { 1.165 + vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, 1.166 + dst, dst_stride, 1.167 + h, filter_x); 1.168 + src += 4; 1.169 + dst += 4; 1.170 + w -= 4; 1.171 + } 1.172 + } 1.173 + if (w) { 1.174 + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, 1.175 + filter_x, x_step_q4, filter_y, y_step_q4, 1.176 + w, h); 1.177 + } 1.178 +} 1.179 + 1.180 +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1.181 + uint8_t *dst, ptrdiff_t dst_stride, 1.182 + const int16_t *filter_x, int x_step_q4, 1.183 + const int16_t *filter_y, int y_step_q4, 1.184 + int w, int h) { 1.185 + if (y_step_q4 == 16 && filter_y[3] != 128) { 1.186 + while (w >= 16) { 1.187 + vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, 1.188 + dst, dst_stride, 1.189 + h, filter_y); 1.190 + src += 16; 1.191 + dst += 16; 1.192 + w -= 16; 1.193 + } 1.194 + while (w >= 8) { 1.195 + vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, 1.196 + dst, dst_stride, 1.197 + h, filter_y); 1.198 + src += 8; 1.199 + dst += 8; 1.200 + w -= 8; 1.201 + } 1.202 + while (w >= 4) { 1.203 + vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, 1.204 + dst, dst_stride, 1.205 + h, filter_y); 1.206 + src += 4; 1.207 + dst += 4; 1.208 + w -= 4; 1.209 + } 1.210 + } 1.211 + if (w) { 1.212 + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, 1.213 + filter_x, x_step_q4, filter_y, y_step_q4, 1.214 + w, h); 1.215 + } 1.216 +} 1.217 + 1.218 +void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1.219 + uint8_t *dst, ptrdiff_t dst_stride, 1.220 + const int16_t *filter_x, int x_step_q4, 1.221 + const int16_t *filter_y, int y_step_q4, 1.222 + int w, int h) { 1.223 + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 1.224 + 1.225 + assert(w <= 64); 1.226 + assert(h <= 64); 1.227 + if (x_step_q4 == 16 && y_step_q4 == 16) { 1.228 + vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, 1.229 + filter_x, x_step_q4, filter_y, y_step_q4, 1.230 + w, h + 7); 1.231 + vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, 1.232 + filter_x, x_step_q4, filter_y, y_step_q4, w, h); 1.233 + } else { 1.234 + vp9_convolve8_c(src, src_stride, dst, dst_stride, 1.235 + filter_x, x_step_q4, filter_y, y_step_q4, w, h); 1.236 + } 1.237 +} 1.238 + 1.239 +void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, 1.240 + uint8_t *dst, ptrdiff_t dst_stride, 1.241 + const int16_t *filter_x, int x_step_q4, 1.242 + const int16_t *filter_y, int y_step_q4, 1.243 + int w, int h) { 1.244 + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 1.245 + 1.246 + assert(w <= 64); 1.247 + assert(h <= 64); 1.248 + if (x_step_q4 == 16 && y_step_q4 == 16) { 1.249 + vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, 1.250 + filter_x, x_step_q4, filter_y, y_step_q4, 1.251 + w, h + 7); 1.252 + vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, 1.253 + filter_x, x_step_q4, filter_y, y_step_q4, 1.254 + w, h); 1.255 + } else { 1.256 + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, 1.257 + filter_x, x_step_q4, filter_y, y_step_q4, w, h); 1.258 + } 1.259 +} 1.260 +#endif 1.261 + 1.262 +#if HAVE_SSE2 1.263 +filter8_1dfunction vp9_filter_block1d16_v8_sse2; 1.264 +filter8_1dfunction vp9_filter_block1d16_h8_sse2; 1.265 +filter8_1dfunction vp9_filter_block1d8_v8_sse2; 1.266 +filter8_1dfunction vp9_filter_block1d8_h8_sse2; 1.267 +filter8_1dfunction vp9_filter_block1d4_v8_sse2; 1.268 +filter8_1dfunction vp9_filter_block1d4_h8_sse2; 1.269 +filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; 1.270 +filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; 1.271 +filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; 1.272 +filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; 1.273 +filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; 1.274 +filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; 1.275 + 1.276 +void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 1.277 + uint8_t *dst, ptrdiff_t dst_stride, 1.278 + const int16_t *filter_x, int x_step_q4, 1.279 + const int16_t *filter_y, int y_step_q4, 1.280 + int w, int h) { 1.281 + /* Ensure the filter can be compressed to int16_t. */ 1.282 + if (x_step_q4 == 16 && filter_x[3] != 128) { 1.283 + while (w >= 16) { 1.284 + vp9_filter_block1d16_h8_sse2(src, src_stride, 1.285 + dst, dst_stride, 1.286 + h, filter_x); 1.287 + src += 16; 1.288 + dst += 16; 1.289 + w -= 16; 1.290 + } 1.291 + while (w >= 8) { 1.292 + vp9_filter_block1d8_h8_sse2(src, src_stride, 1.293 + dst, dst_stride, 1.294 + h, filter_x); 1.295 + src += 8; 1.296 + dst += 8; 1.297 + w -= 8; 1.298 + } 1.299 + while (w >= 4) { 1.300 + vp9_filter_block1d4_h8_sse2(src, src_stride, 1.301 + dst, dst_stride, 1.302 + h, filter_x); 1.303 + src += 4; 1.304 + dst += 4; 1.305 + w -= 4; 1.306 + } 1.307 + } 1.308 + if (w) { 1.309 + vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, 1.310 + filter_x, x_step_q4, filter_y, y_step_q4, 1.311 + w, h); 1.312 + } 1.313 +} 1.314 + 1.315 +void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 1.316 + uint8_t *dst, ptrdiff_t dst_stride, 1.317 + const int16_t *filter_x, int x_step_q4, 1.318 + const int16_t *filter_y, int y_step_q4, 1.319 + int w, int h) { 1.320 + if (y_step_q4 == 16 && filter_y[3] != 128) { 1.321 + while (w >= 16) { 1.322 + vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, 1.323 + dst, dst_stride, 1.324 + h, filter_y); 1.325 + src += 16; 1.326 + dst += 16; 1.327 + w -= 16; 1.328 + } 1.329 + while (w >= 8) { 1.330 + vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, 1.331 + dst, dst_stride, 1.332 + h, filter_y); 1.333 + src += 8; 1.334 + dst += 8; 1.335 + w -= 8; 1.336 + } 1.337 + while (w >= 4) { 1.338 + vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, 1.339 + dst, dst_stride, 1.340 + h, filter_y); 1.341 + src += 4; 1.342 + dst += 4; 1.343 + w -= 4; 1.344 + } 1.345 + } 1.346 + if (w) { 1.347 + vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, 1.348 + filter_x, x_step_q4, filter_y, y_step_q4, 1.349 + w, h); 1.350 + } 1.351 +} 1.352 + 1.353 +void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, 1.354 + uint8_t *dst, ptrdiff_t dst_stride, 1.355 + const int16_t *filter_x, int x_step_q4, 1.356 + const int16_t *filter_y, int y_step_q4, 1.357 + int w, int h) { 1.358 + if (x_step_q4 == 16 && filter_x[3] != 128) { 1.359 + while (w >= 16) { 1.360 + vp9_filter_block1d16_h8_avg_sse2(src, src_stride, 1.361 + dst, dst_stride, 1.362 + h, filter_x); 1.363 + src += 16; 1.364 + dst += 16; 1.365 + w -= 16; 1.366 + } 1.367 + while (w >= 8) { 1.368 + vp9_filter_block1d8_h8_avg_sse2(src, src_stride, 1.369 + dst, dst_stride, 1.370 + h, filter_x); 1.371 + src += 8; 1.372 + dst += 8; 1.373 + w -= 8; 1.374 + } 1.375 + while (w >= 4) { 1.376 + vp9_filter_block1d4_h8_avg_sse2(src, src_stride, 1.377 + dst, dst_stride, 1.378 + h, filter_x); 1.379 + src += 4; 1.380 + dst += 4; 1.381 + w -= 4; 1.382 + } 1.383 + } 1.384 + if (w) { 1.385 + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, 1.386 + filter_x, x_step_q4, filter_y, y_step_q4, 1.387 + w, h); 1.388 + } 1.389 +} 1.390 + 1.391 +void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, 1.392 + uint8_t *dst, ptrdiff_t dst_stride, 1.393 + const int16_t *filter_x, int x_step_q4, 1.394 + const int16_t *filter_y, int y_step_q4, 1.395 + int w, int h) { 1.396 + if (y_step_q4 == 16 && filter_y[3] != 128) { 1.397 + while (w >= 16) { 1.398 + vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, 1.399 + dst, dst_stride, 1.400 + h, filter_y); 1.401 + src += 16; 1.402 + dst += 16; 1.403 + w -= 16; 1.404 + } 1.405 + while (w >= 8) { 1.406 + vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, 1.407 + dst, dst_stride, 1.408 + h, filter_y); 1.409 + src += 8; 1.410 + dst += 8; 1.411 + w -= 8; 1.412 + } 1.413 + while (w >= 4) { 1.414 + vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, 1.415 + dst, dst_stride, 1.416 + h, filter_y); 1.417 + src += 4; 1.418 + dst += 4; 1.419 + w -= 4; 1.420 + } 1.421 + } 1.422 + if (w) { 1.423 + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, 1.424 + filter_x, x_step_q4, filter_y, y_step_q4, 1.425 + w, h); 1.426 + } 1.427 +} 1.428 + 1.429 +void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, 1.430 + uint8_t *dst, ptrdiff_t dst_stride, 1.431 + const int16_t *filter_x, int x_step_q4, 1.432 + const int16_t *filter_y, int y_step_q4, 1.433 + int w, int h) { 1.434 + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 1.435 + 1.436 + assert(w <= 64); 1.437 + assert(h <= 64); 1.438 + if (x_step_q4 == 16 && y_step_q4 == 16) { 1.439 + vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, 1.440 + filter_x, x_step_q4, filter_y, y_step_q4, 1.441 + w, h + 7); 1.442 + vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, 1.443 + filter_x, x_step_q4, filter_y, y_step_q4, w, h); 1.444 + } else { 1.445 + vp9_convolve8_c(src, src_stride, dst, dst_stride, 1.446 + filter_x, x_step_q4, filter_y, y_step_q4, w, h); 1.447 + } 1.448 +} 1.449 + 1.450 +void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, 1.451 + uint8_t *dst, ptrdiff_t dst_stride, 1.452 + const int16_t *filter_x, int x_step_q4, 1.453 + const int16_t *filter_y, int y_step_q4, 1.454 + int w, int h) { 1.455 + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); 1.456 + 1.457 + assert(w <= 64); 1.458 + assert(h <= 64); 1.459 + if (x_step_q4 == 16 && y_step_q4 == 16) { 1.460 + vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, 1.461 + filter_x, x_step_q4, filter_y, y_step_q4, 1.462 + w, h + 7); 1.463 + vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, 1.464 + filter_x, x_step_q4, filter_y, y_step_q4, 1.465 + w, h); 1.466 + } else { 1.467 + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, 1.468 + filter_x, x_step_q4, filter_y, y_step_q4, w, h); 1.469 + } 1.470 +} 1.471 +#endif