1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/common/x86/vp8_asm_stubs.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,625 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 + 1.15 +#include "vpx_config.h" 1.16 +#include "vp8_rtcd.h" 1.17 +#include "vpx_ports/mem.h" 1.18 +#include "filter_x86.h" 1.19 + 1.20 +extern const short vp8_six_tap_mmx[8][6*8]; 1.21 + 1.22 +extern void vp8_filter_block1d_h6_mmx 1.23 +( 1.24 + unsigned char *src_ptr, 1.25 + unsigned short *output_ptr, 1.26 + unsigned int src_pixels_per_line, 1.27 + unsigned int pixel_step, 1.28 + unsigned int output_height, 1.29 + unsigned int output_width, 1.30 + const short *vp8_filter 1.31 +); 1.32 +extern void vp8_filter_block1dc_v6_mmx 1.33 +( 1.34 + unsigned short *src_ptr, 1.35 + unsigned char *output_ptr, 1.36 + int output_pitch, 1.37 + unsigned int pixels_per_line, 1.38 + unsigned int pixel_step, 1.39 + unsigned int output_height, 1.40 + unsigned int output_width, 1.41 + const short *vp8_filter 1.42 +); 1.43 +extern void vp8_filter_block1d8_h6_sse2 1.44 +( 1.45 + unsigned char *src_ptr, 1.46 + unsigned short *output_ptr, 1.47 + unsigned int src_pixels_per_line, 1.48 + unsigned int pixel_step, 1.49 + unsigned int output_height, 1.50 + unsigned int output_width, 1.51 + const short *vp8_filter 1.52 +); 1.53 +extern void vp8_filter_block1d16_h6_sse2 1.54 +( 1.55 + unsigned char *src_ptr, 1.56 + unsigned short *output_ptr, 1.57 + unsigned int src_pixels_per_line, 1.58 + unsigned int pixel_step, 1.59 + unsigned int output_height, 1.60 + unsigned int output_width, 1.61 + const short *vp8_filter 1.62 +); 1.63 +extern void vp8_filter_block1d8_v6_sse2 1.64 +( 1.65 + unsigned short *src_ptr, 1.66 + unsigned char *output_ptr, 1.67 + int dst_ptich, 1.68 + unsigned int pixels_per_line, 1.69 + unsigned int pixel_step, 1.70 + unsigned int output_height, 1.71 + unsigned int output_width, 1.72 + const short *vp8_filter 1.73 +); 1.74 +extern void vp8_filter_block1d16_v6_sse2 1.75 +( 1.76 + unsigned short *src_ptr, 1.77 + unsigned char *output_ptr, 1.78 + int dst_ptich, 1.79 + unsigned int pixels_per_line, 1.80 + unsigned int pixel_step, 1.81 + unsigned int output_height, 1.82 + unsigned int output_width, 1.83 + const short *vp8_filter 1.84 +); 1.85 +extern void vp8_unpack_block1d16_h6_sse2 1.86 +( 1.87 + unsigned char *src_ptr, 1.88 + unsigned short *output_ptr, 1.89 + unsigned int src_pixels_per_line, 1.90 + unsigned int output_height, 1.91 + unsigned int output_width 1.92 +); 1.93 +extern void vp8_filter_block1d8_h6_only_sse2 1.94 +( 1.95 + unsigned char *src_ptr, 1.96 + unsigned int src_pixels_per_line, 1.97 + unsigned char *output_ptr, 1.98 + int dst_ptich, 1.99 + unsigned int output_height, 1.100 + const short *vp8_filter 1.101 +); 1.102 +extern void vp8_filter_block1d16_h6_only_sse2 1.103 +( 1.104 + unsigned char *src_ptr, 1.105 + unsigned int src_pixels_per_line, 1.106 + unsigned char *output_ptr, 1.107 + int dst_ptich, 1.108 + unsigned int output_height, 1.109 + const short *vp8_filter 1.110 +); 1.111 +extern void vp8_filter_block1d8_v6_only_sse2 1.112 +( 1.113 + unsigned char *src_ptr, 1.114 + unsigned int src_pixels_per_line, 1.115 + unsigned char *output_ptr, 1.116 + int dst_ptich, 1.117 + unsigned int output_height, 1.118 + const short *vp8_filter 1.119 +); 1.120 + 1.121 + 1.122 +#if HAVE_MMX 1.123 +void vp8_sixtap_predict4x4_mmx 1.124 +( 1.125 + unsigned char *src_ptr, 1.126 + int src_pixels_per_line, 1.127 + int xoffset, 1.128 + int yoffset, 1.129 + unsigned char *dst_ptr, 1.130 + int dst_pitch 1.131 +) 1.132 +{ 1.133 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */ 1.134 + const short *HFilter, *VFilter; 1.135 + HFilter = vp8_six_tap_mmx[xoffset]; 1.136 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); 1.137 + VFilter = vp8_six_tap_mmx[yoffset]; 1.138 + vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); 1.139 + 1.140 +} 1.141 + 1.142 + 1.143 +void vp8_sixtap_predict16x16_mmx 1.144 +( 1.145 + unsigned char *src_ptr, 1.146 + int src_pixels_per_line, 1.147 + int xoffset, 1.148 + int yoffset, 1.149 + unsigned char *dst_ptr, 1.150 + int dst_pitch 1.151 +) 1.152 +{ 1.153 + 1.154 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ 1.155 + 1.156 + const short *HFilter, *VFilter; 1.157 + 1.158 + 1.159 + HFilter = vp8_six_tap_mmx[xoffset]; 1.160 + 1.161 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); 1.162 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); 1.163 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); 1.164 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); 1.165 + 1.166 + VFilter = vp8_six_tap_mmx[yoffset]; 1.167 + vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); 1.168 + vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); 1.169 + vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); 1.170 + vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); 1.171 + 1.172 +} 1.173 + 1.174 + 1.175 +void vp8_sixtap_predict8x8_mmx 1.176 +( 1.177 + unsigned char *src_ptr, 1.178 + int src_pixels_per_line, 1.179 + int xoffset, 1.180 + int yoffset, 1.181 + unsigned char *dst_ptr, 1.182 + int dst_pitch 1.183 +) 1.184 +{ 1.185 + 1.186 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 1.187 + 1.188 + const short *HFilter, *VFilter; 1.189 + 1.190 + HFilter = vp8_six_tap_mmx[xoffset]; 1.191 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); 1.192 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); 1.193 + 1.194 + VFilter = vp8_six_tap_mmx[yoffset]; 1.195 + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); 1.196 + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); 1.197 + 1.198 +} 1.199 + 1.200 + 1.201 +void vp8_sixtap_predict8x4_mmx 1.202 +( 1.203 + unsigned char *src_ptr, 1.204 + int src_pixels_per_line, 1.205 + int xoffset, 1.206 + int yoffset, 1.207 + unsigned char *dst_ptr, 1.208 + int dst_pitch 1.209 +) 1.210 +{ 1.211 + 1.212 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 1.213 + 1.214 + const short *HFilter, *VFilter; 1.215 + 1.216 + HFilter = vp8_six_tap_mmx[xoffset]; 1.217 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); 1.218 + vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); 1.219 + 1.220 + VFilter = vp8_six_tap_mmx[yoffset]; 1.221 + vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); 1.222 + vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); 1.223 + 1.224 +} 1.225 + 1.226 + 1.227 + 1.228 +void vp8_bilinear_predict16x16_mmx 1.229 +( 1.230 + unsigned char *src_ptr, 1.231 + int src_pixels_per_line, 1.232 + int xoffset, 1.233 + int yoffset, 1.234 + unsigned char *dst_ptr, 1.235 + int dst_pitch 1.236 +) 1.237 +{ 1.238 + vp8_bilinear_predict8x8_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pitch); 1.239 + vp8_bilinear_predict8x8_mmx(src_ptr + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + 8, dst_pitch); 1.240 + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8, dst_pitch); 1.241 + vp8_bilinear_predict8x8_mmx(src_ptr + 8 * src_pixels_per_line + 8, src_pixels_per_line, xoffset, yoffset, dst_ptr + dst_pitch * 8 + 8, dst_pitch); 1.242 +} 1.243 +#endif 1.244 + 1.245 + 1.246 +#if HAVE_SSE2 1.247 +void vp8_sixtap_predict16x16_sse2 1.248 +( 1.249 + unsigned char *src_ptr, 1.250 + int src_pixels_per_line, 1.251 + int xoffset, 1.252 + int yoffset, 1.253 + unsigned char *dst_ptr, 1.254 + int dst_pitch 1.255 + 1.256 +) 1.257 +{ 1.258 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ 1.259 + 1.260 + const short *HFilter, *VFilter; 1.261 + 1.262 + if (xoffset) 1.263 + { 1.264 + if (yoffset) 1.265 + { 1.266 + HFilter = vp8_six_tap_mmx[xoffset]; 1.267 + vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); 1.268 + VFilter = vp8_six_tap_mmx[yoffset]; 1.269 + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); 1.270 + } 1.271 + else 1.272 + { 1.273 + /* First-pass only */ 1.274 + HFilter = vp8_six_tap_mmx[xoffset]; 1.275 + vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); 1.276 + } 1.277 + } 1.278 + else 1.279 + { 1.280 + /* Second-pass only */ 1.281 + VFilter = vp8_six_tap_mmx[yoffset]; 1.282 + vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); 1.283 + vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); 1.284 + } 1.285 +} 1.286 + 1.287 + 1.288 +void vp8_sixtap_predict8x8_sse2 1.289 +( 1.290 + unsigned char *src_ptr, 1.291 + int src_pixels_per_line, 1.292 + int xoffset, 1.293 + int yoffset, 1.294 + unsigned char *dst_ptr, 1.295 + int dst_pitch 1.296 +) 1.297 +{ 1.298 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 1.299 + const short *HFilter, *VFilter; 1.300 + 1.301 + if (xoffset) 1.302 + { 1.303 + if (yoffset) 1.304 + { 1.305 + HFilter = vp8_six_tap_mmx[xoffset]; 1.306 + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); 1.307 + VFilter = vp8_six_tap_mmx[yoffset]; 1.308 + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); 1.309 + } 1.310 + else 1.311 + { 1.312 + /* First-pass only */ 1.313 + HFilter = vp8_six_tap_mmx[xoffset]; 1.314 + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); 1.315 + } 1.316 + } 1.317 + else 1.318 + { 1.319 + /* Second-pass only */ 1.320 + VFilter = vp8_six_tap_mmx[yoffset]; 1.321 + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); 1.322 + } 1.323 +} 1.324 + 1.325 + 1.326 +void vp8_sixtap_predict8x4_sse2 1.327 +( 1.328 + unsigned char *src_ptr, 1.329 + int src_pixels_per_line, 1.330 + int xoffset, 1.331 + int yoffset, 1.332 + unsigned char *dst_ptr, 1.333 + int dst_pitch 1.334 +) 1.335 +{ 1.336 + DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ 1.337 + const short *HFilter, *VFilter; 1.338 + 1.339 + if (xoffset) 1.340 + { 1.341 + if (yoffset) 1.342 + { 1.343 + HFilter = vp8_six_tap_mmx[xoffset]; 1.344 + vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); 1.345 + VFilter = vp8_six_tap_mmx[yoffset]; 1.346 + vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); 1.347 + } 1.348 + else 1.349 + { 1.350 + /* First-pass only */ 1.351 + HFilter = vp8_six_tap_mmx[xoffset]; 1.352 + vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); 1.353 + } 1.354 + } 1.355 + else 1.356 + { 1.357 + /* Second-pass only */ 1.358 + VFilter = vp8_six_tap_mmx[yoffset]; 1.359 + vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); 1.360 + } 1.361 +} 1.362 + 1.363 +#endif 1.364 + 1.365 +#if HAVE_SSSE3 1.366 + 1.367 +extern void vp8_filter_block1d8_h6_ssse3 1.368 +( 1.369 + unsigned char *src_ptr, 1.370 + unsigned int src_pixels_per_line, 1.371 + unsigned char *output_ptr, 1.372 + unsigned int output_pitch, 1.373 + unsigned int output_height, 1.374 + unsigned int vp8_filter_index 1.375 +); 1.376 + 1.377 +extern void vp8_filter_block1d16_h6_ssse3 1.378 +( 1.379 + unsigned char *src_ptr, 1.380 + unsigned int src_pixels_per_line, 1.381 + unsigned char *output_ptr, 1.382 + unsigned int output_pitch, 1.383 + unsigned int output_height, 1.384 + unsigned int vp8_filter_index 1.385 +); 1.386 + 1.387 +extern void vp8_filter_block1d16_v6_ssse3 1.388 +( 1.389 + unsigned char *src_ptr, 1.390 + unsigned int src_pitch, 1.391 + unsigned char *output_ptr, 1.392 + unsigned int out_pitch, 1.393 + unsigned int output_height, 1.394 + unsigned int vp8_filter_index 1.395 +); 1.396 + 1.397 +extern void vp8_filter_block1d8_v6_ssse3 1.398 +( 1.399 + unsigned char *src_ptr, 1.400 + unsigned int src_pitch, 1.401 + unsigned char *output_ptr, 1.402 + unsigned int out_pitch, 1.403 + unsigned int output_height, 1.404 + unsigned int vp8_filter_index 1.405 +); 1.406 + 1.407 +extern void vp8_filter_block1d4_h6_ssse3 1.408 +( 1.409 + unsigned char *src_ptr, 1.410 + unsigned int src_pixels_per_line, 1.411 + unsigned char *output_ptr, 1.412 + unsigned int output_pitch, 1.413 + unsigned int output_height, 1.414 + unsigned int vp8_filter_index 1.415 +); 1.416 + 1.417 +extern void vp8_filter_block1d4_v6_ssse3 1.418 +( 1.419 + unsigned char *src_ptr, 1.420 + unsigned int src_pitch, 1.421 + unsigned char *output_ptr, 1.422 + unsigned int out_pitch, 1.423 + unsigned int output_height, 1.424 + unsigned int vp8_filter_index 1.425 +); 1.426 + 1.427 +void vp8_sixtap_predict16x16_ssse3 1.428 +( 1.429 + unsigned char *src_ptr, 1.430 + int src_pixels_per_line, 1.431 + int xoffset, 1.432 + int yoffset, 1.433 + unsigned char *dst_ptr, 1.434 + int dst_pitch 1.435 + 1.436 +) 1.437 +{ 1.438 + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); 1.439 + 1.440 + if (xoffset) 1.441 + { 1.442 + if (yoffset) 1.443 + { 1.444 + vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.445 + src_pixels_per_line, FData2, 1.446 + 16, 21, xoffset); 1.447 + vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, 1.448 + 16, yoffset); 1.449 + } 1.450 + else 1.451 + { 1.452 + /* First-pass only */ 1.453 + vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, 1.454 + dst_ptr, dst_pitch, 16, xoffset); 1.455 + } 1.456 + } 1.457 + else 1.458 + { 1.459 + if (yoffset) 1.460 + { 1.461 + /* Second-pass only */ 1.462 + vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.463 + src_pixels_per_line, 1.464 + dst_ptr, dst_pitch, 16, yoffset); 1.465 + } 1.466 + else 1.467 + { 1.468 + /* ssse3 second-pass only function couldn't handle (xoffset==0 && 1.469 + * yoffset==0) case correctly. Add copy function here to guarantee 1.470 + * six-tap function handles all possible offsets. */ 1.471 + vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 1.472 + } 1.473 + } 1.474 +} 1.475 + 1.476 +void vp8_sixtap_predict8x8_ssse3 1.477 +( 1.478 + unsigned char *src_ptr, 1.479 + int src_pixels_per_line, 1.480 + int xoffset, 1.481 + int yoffset, 1.482 + unsigned char *dst_ptr, 1.483 + int dst_pitch 1.484 +) 1.485 +{ 1.486 + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); 1.487 + 1.488 + if (xoffset) 1.489 + { 1.490 + if (yoffset) 1.491 + { 1.492 + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.493 + src_pixels_per_line, FData2, 1.494 + 8, 13, xoffset); 1.495 + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 1.496 + 8, yoffset); 1.497 + } 1.498 + else 1.499 + { 1.500 + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, 1.501 + dst_ptr, dst_pitch, 8, xoffset); 1.502 + } 1.503 + } 1.504 + else 1.505 + { 1.506 + if (yoffset) 1.507 + { 1.508 + /* Second-pass only */ 1.509 + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.510 + src_pixels_per_line, 1.511 + dst_ptr, dst_pitch, 8, yoffset); 1.512 + } 1.513 + else 1.514 + { 1.515 + /* ssse3 second-pass only function couldn't handle (xoffset==0 && 1.516 + * yoffset==0) case correctly. Add copy function here to guarantee 1.517 + * six-tap function handles all possible offsets. */ 1.518 + vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 1.519 + } 1.520 + } 1.521 +} 1.522 + 1.523 + 1.524 +void vp8_sixtap_predict8x4_ssse3 1.525 +( 1.526 + unsigned char *src_ptr, 1.527 + int src_pixels_per_line, 1.528 + int xoffset, 1.529 + int yoffset, 1.530 + unsigned char *dst_ptr, 1.531 + int dst_pitch 1.532 +) 1.533 +{ 1.534 + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); 1.535 + 1.536 + if (xoffset) 1.537 + { 1.538 + if (yoffset) 1.539 + { 1.540 + vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.541 + src_pixels_per_line, FData2, 1.542 + 8, 9, xoffset); 1.543 + vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, 1.544 + 4, yoffset); 1.545 + } 1.546 + else 1.547 + { 1.548 + /* First-pass only */ 1.549 + vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, 1.550 + dst_ptr, dst_pitch, 4, xoffset); 1.551 + } 1.552 + } 1.553 + else 1.554 + { 1.555 + if (yoffset) 1.556 + { 1.557 + /* Second-pass only */ 1.558 + vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.559 + src_pixels_per_line, 1.560 + dst_ptr, dst_pitch, 4, yoffset); 1.561 + } 1.562 + else 1.563 + { 1.564 + /* ssse3 second-pass only function couldn't handle (xoffset==0 && 1.565 + * yoffset==0) case correctly. Add copy function here to guarantee 1.566 + * six-tap function handles all possible offsets. */ 1.567 + vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); 1.568 + } 1.569 + } 1.570 +} 1.571 + 1.572 +void vp8_sixtap_predict4x4_ssse3 1.573 +( 1.574 + unsigned char *src_ptr, 1.575 + int src_pixels_per_line, 1.576 + int xoffset, 1.577 + int yoffset, 1.578 + unsigned char *dst_ptr, 1.579 + int dst_pitch 1.580 +) 1.581 +{ 1.582 + DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9); 1.583 + 1.584 + if (xoffset) 1.585 + { 1.586 + if (yoffset) 1.587 + { 1.588 + vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.589 + src_pixels_per_line, 1.590 + FData2, 4, 9, xoffset); 1.591 + vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, 1.592 + 4, yoffset); 1.593 + } 1.594 + else 1.595 + { 1.596 + vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, 1.597 + dst_ptr, dst_pitch, 4, xoffset); 1.598 + } 1.599 + } 1.600 + else 1.601 + { 1.602 + if (yoffset) 1.603 + { 1.604 + vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), 1.605 + src_pixels_per_line, 1.606 + dst_ptr, dst_pitch, 4, yoffset); 1.607 + } 1.608 + else 1.609 + { 1.610 + /* ssse3 second-pass only function couldn't handle (xoffset==0 && 1.611 + * yoffset==0) case correctly. Add copy function here to guarantee 1.612 + * six-tap function handles all possible offsets. */ 1.613 + int r; 1.614 + 1.615 + for (r = 0; r < 4; r++) 1.616 + { 1.617 + dst_ptr[0] = src_ptr[0]; 1.618 + dst_ptr[1] = src_ptr[1]; 1.619 + dst_ptr[2] = src_ptr[2]; 1.620 + dst_ptr[3] = src_ptr[3]; 1.621 + dst_ptr += dst_pitch; 1.622 + src_ptr += src_pixels_per_line; 1.623 + } 1.624 + } 1.625 + } 1.626 +} 1.627 + 1.628 +#endif