1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/common/x86/variance_sse2.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,558 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "vpx_config.h" 1.15 +#include "vp8/common/variance.h" 1.16 +#include "vp8/common/pragmas.h" 1.17 +#include "vpx_ports/mem.h" 1.18 +#include "vp8/common/x86/filter_x86.h" 1.19 + 1.20 +extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 1.21 +extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 1.22 +extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 1.23 +extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter); 1.24 + 1.25 +extern void vp8_filter_block2d_bil4x4_var_mmx 1.26 +( 1.27 + const unsigned char *ref_ptr, 1.28 + int ref_pixels_per_line, 1.29 + const unsigned char *src_ptr, 1.30 + int src_pixels_per_line, 1.31 + const short *HFilter, 1.32 + const short *VFilter, 1.33 + int *sum, 1.34 + unsigned int *sumsquared 1.35 +); 1.36 + 1.37 +extern unsigned int vp8_get4x4var_mmx 1.38 +( 1.39 + const unsigned char *src_ptr, 1.40 + int source_stride, 1.41 + const unsigned char *ref_ptr, 1.42 + int recon_stride, 1.43 + unsigned int *SSE, 1.44 + int *Sum 1.45 +); 1.46 + 1.47 +unsigned int vp8_get_mb_ss_sse2 1.48 +( 1.49 + const short *src_ptr 1.50 +); 1.51 +unsigned int vp8_get16x16var_sse2 1.52 +( 1.53 + const unsigned char *src_ptr, 1.54 + int source_stride, 1.55 + const unsigned char *ref_ptr, 1.56 + int recon_stride, 1.57 + unsigned int *SSE, 1.58 + int *Sum 1.59 +); 1.60 +unsigned int vp8_get8x8var_sse2 1.61 +( 1.62 + const unsigned char *src_ptr, 1.63 + int source_stride, 1.64 + const unsigned char *ref_ptr, 1.65 + int recon_stride, 1.66 + unsigned int *SSE, 1.67 + int *Sum 1.68 +); 1.69 +void vp8_filter_block2d_bil_var_sse2 1.70 +( 1.71 + const unsigned char *ref_ptr, 1.72 + int ref_pixels_per_line, 1.73 + const unsigned char *src_ptr, 1.74 + int src_pixels_per_line, 1.75 + unsigned int Height, 1.76 + int xoffset, 1.77 + int yoffset, 1.78 + int *sum, 1.79 + unsigned int *sumsquared 1.80 +); 1.81 +void vp8_half_horiz_vert_variance8x_h_sse2 1.82 +( 1.83 + const unsigned char *ref_ptr, 1.84 + int ref_pixels_per_line, 1.85 + const unsigned char *src_ptr, 1.86 + int src_pixels_per_line, 1.87 + unsigned int Height, 1.88 + int *sum, 1.89 + unsigned int *sumsquared 1.90 +); 1.91 +void vp8_half_horiz_vert_variance16x_h_sse2 1.92 +( 1.93 + const unsigned char *ref_ptr, 1.94 + int ref_pixels_per_line, 1.95 + const unsigned char *src_ptr, 1.96 + int src_pixels_per_line, 1.97 + unsigned int Height, 1.98 + int *sum, 1.99 + unsigned int *sumsquared 1.100 +); 1.101 +void vp8_half_horiz_variance8x_h_sse2 1.102 +( 1.103 + const unsigned char *ref_ptr, 1.104 + int ref_pixels_per_line, 1.105 + const unsigned char *src_ptr, 1.106 + int src_pixels_per_line, 1.107 + unsigned int Height, 1.108 + int *sum, 1.109 + unsigned int *sumsquared 1.110 +); 1.111 +void vp8_half_horiz_variance16x_h_sse2 1.112 +( 1.113 + const unsigned char *ref_ptr, 1.114 + int ref_pixels_per_line, 1.115 + const unsigned char *src_ptr, 1.116 + int src_pixels_per_line, 1.117 + unsigned int Height, 1.118 + int *sum, 1.119 + unsigned int *sumsquared 1.120 +); 1.121 +void vp8_half_vert_variance8x_h_sse2 1.122 +( 1.123 + const unsigned char *ref_ptr, 1.124 + int ref_pixels_per_line, 1.125 + const unsigned char *src_ptr, 1.126 + int src_pixels_per_line, 1.127 + unsigned int Height, 1.128 + int *sum, 1.129 + unsigned int *sumsquared 1.130 +); 1.131 +void vp8_half_vert_variance16x_h_sse2 1.132 +( 1.133 + const unsigned char *ref_ptr, 1.134 + int ref_pixels_per_line, 1.135 + const unsigned char *src_ptr, 1.136 + int src_pixels_per_line, 1.137 + unsigned int Height, 1.138 + int *sum, 1.139 + unsigned int *sumsquared 1.140 +); 1.141 + 1.142 +unsigned int vp8_variance4x4_wmt( 1.143 + const unsigned char *src_ptr, 1.144 + int source_stride, 1.145 + const unsigned char *ref_ptr, 1.146 + int recon_stride, 1.147 + unsigned int *sse) 1.148 +{ 1.149 + unsigned int var; 1.150 + int avg; 1.151 + 1.152 + vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 1.153 + *sse = var; 1.154 + return (var - (((unsigned int)avg * avg) >> 4)); 1.155 + 1.156 +} 1.157 + 1.158 +unsigned int vp8_variance8x8_wmt 1.159 +( 1.160 + const unsigned char *src_ptr, 1.161 + int source_stride, 1.162 + const unsigned char *ref_ptr, 1.163 + int recon_stride, 1.164 + unsigned int *sse) 1.165 +{ 1.166 + unsigned int var; 1.167 + int avg; 1.168 + 1.169 + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 1.170 + *sse = var; 1.171 + return (var - (((unsigned int)avg * avg) >> 6)); 1.172 + 1.173 +} 1.174 + 1.175 + 1.176 +unsigned int vp8_variance16x16_wmt 1.177 +( 1.178 + const unsigned char *src_ptr, 1.179 + int source_stride, 1.180 + const unsigned char *ref_ptr, 1.181 + int recon_stride, 1.182 + unsigned int *sse) 1.183 +{ 1.184 + unsigned int sse0; 1.185 + int sum0; 1.186 + 1.187 + 1.188 + vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.189 + *sse = sse0; 1.190 + return (sse0 - (((unsigned int)sum0 * sum0) >> 8)); 1.191 +} 1.192 +unsigned int vp8_mse16x16_wmt( 1.193 + const unsigned char *src_ptr, 1.194 + int source_stride, 1.195 + const unsigned char *ref_ptr, 1.196 + int recon_stride, 1.197 + unsigned int *sse) 1.198 +{ 1.199 + 1.200 + unsigned int sse0; 1.201 + int sum0; 1.202 + vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.203 + *sse = sse0; 1.204 + return sse0; 1.205 + 1.206 +} 1.207 + 1.208 + 1.209 +unsigned int vp8_variance16x8_wmt 1.210 +( 1.211 + const unsigned char *src_ptr, 1.212 + int source_stride, 1.213 + const unsigned char *ref_ptr, 1.214 + int recon_stride, 1.215 + unsigned int *sse) 1.216 +{ 1.217 + unsigned int sse0, sse1, var; 1.218 + int sum0, sum1, avg; 1.219 + 1.220 + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.221 + vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 1.222 + 1.223 + var = sse0 + sse1; 1.224 + avg = sum0 + sum1; 1.225 + *sse = var; 1.226 + return (var - (((unsigned int)avg * avg) >> 7)); 1.227 + 1.228 +} 1.229 + 1.230 +unsigned int vp8_variance8x16_wmt 1.231 +( 1.232 + const unsigned char *src_ptr, 1.233 + int source_stride, 1.234 + const unsigned char *ref_ptr, 1.235 + int recon_stride, 1.236 + unsigned int *sse) 1.237 +{ 1.238 + unsigned int sse0, sse1, var; 1.239 + int sum0, sum1, avg; 1.240 + 1.241 + vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.242 + vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 1.243 + 1.244 + var = sse0 + sse1; 1.245 + avg = sum0 + sum1; 1.246 + *sse = var; 1.247 + return (var - (((unsigned int)avg * avg) >> 7)); 1.248 + 1.249 +} 1.250 + 1.251 +unsigned int vp8_sub_pixel_variance4x4_wmt 1.252 +( 1.253 + const unsigned char *src_ptr, 1.254 + int src_pixels_per_line, 1.255 + int xoffset, 1.256 + int yoffset, 1.257 + const unsigned char *dst_ptr, 1.258 + int dst_pixels_per_line, 1.259 + unsigned int *sse 1.260 +) 1.261 +{ 1.262 + int xsum; 1.263 + unsigned int xxsum; 1.264 + vp8_filter_block2d_bil4x4_var_mmx( 1.265 + src_ptr, src_pixels_per_line, 1.266 + dst_ptr, dst_pixels_per_line, 1.267 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.268 + &xsum, &xxsum 1.269 + ); 1.270 + *sse = xxsum; 1.271 + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); 1.272 +} 1.273 + 1.274 + 1.275 +unsigned int vp8_sub_pixel_variance8x8_wmt 1.276 +( 1.277 + const unsigned char *src_ptr, 1.278 + int src_pixels_per_line, 1.279 + int xoffset, 1.280 + int yoffset, 1.281 + const unsigned char *dst_ptr, 1.282 + int dst_pixels_per_line, 1.283 + unsigned int *sse 1.284 +) 1.285 +{ 1.286 + int xsum; 1.287 + unsigned int xxsum; 1.288 + 1.289 + if (xoffset == 4 && yoffset == 0) 1.290 + { 1.291 + vp8_half_horiz_variance8x_h_sse2( 1.292 + src_ptr, src_pixels_per_line, 1.293 + dst_ptr, dst_pixels_per_line, 8, 1.294 + &xsum, &xxsum); 1.295 + } 1.296 + else if (xoffset == 0 && yoffset == 4) 1.297 + { 1.298 + vp8_half_vert_variance8x_h_sse2( 1.299 + src_ptr, src_pixels_per_line, 1.300 + dst_ptr, dst_pixels_per_line, 8, 1.301 + &xsum, &xxsum); 1.302 + } 1.303 + else if (xoffset == 4 && yoffset == 4) 1.304 + { 1.305 + vp8_half_horiz_vert_variance8x_h_sse2( 1.306 + src_ptr, src_pixels_per_line, 1.307 + dst_ptr, dst_pixels_per_line, 8, 1.308 + &xsum, &xxsum); 1.309 + } 1.310 + else 1.311 + { 1.312 + vp8_filter_block2d_bil_var_sse2( 1.313 + src_ptr, src_pixels_per_line, 1.314 + dst_ptr, dst_pixels_per_line, 8, 1.315 + xoffset, yoffset, 1.316 + &xsum, &xxsum); 1.317 + } 1.318 + 1.319 + *sse = xxsum; 1.320 + return (xxsum - (((unsigned int)xsum * xsum) >> 6)); 1.321 +} 1.322 + 1.323 +unsigned int vp8_sub_pixel_variance16x16_wmt 1.324 +( 1.325 + const unsigned char *src_ptr, 1.326 + int src_pixels_per_line, 1.327 + int xoffset, 1.328 + int yoffset, 1.329 + const unsigned char *dst_ptr, 1.330 + int dst_pixels_per_line, 1.331 + unsigned int *sse 1.332 +) 1.333 +{ 1.334 + int xsum0, xsum1; 1.335 + unsigned int xxsum0, xxsum1; 1.336 + 1.337 + 1.338 + /* note we could avoid these if statements if the calling function 1.339 + * just called the appropriate functions inside. 1.340 + */ 1.341 + if (xoffset == 4 && yoffset == 0) 1.342 + { 1.343 + vp8_half_horiz_variance16x_h_sse2( 1.344 + src_ptr, src_pixels_per_line, 1.345 + dst_ptr, dst_pixels_per_line, 16, 1.346 + &xsum0, &xxsum0); 1.347 + } 1.348 + else if (xoffset == 0 && yoffset == 4) 1.349 + { 1.350 + vp8_half_vert_variance16x_h_sse2( 1.351 + src_ptr, src_pixels_per_line, 1.352 + dst_ptr, dst_pixels_per_line, 16, 1.353 + &xsum0, &xxsum0); 1.354 + } 1.355 + else if (xoffset == 4 && yoffset == 4) 1.356 + { 1.357 + vp8_half_horiz_vert_variance16x_h_sse2( 1.358 + src_ptr, src_pixels_per_line, 1.359 + dst_ptr, dst_pixels_per_line, 16, 1.360 + &xsum0, &xxsum0); 1.361 + } 1.362 + else 1.363 + { 1.364 + vp8_filter_block2d_bil_var_sse2( 1.365 + src_ptr, src_pixels_per_line, 1.366 + dst_ptr, dst_pixels_per_line, 16, 1.367 + xoffset, yoffset, 1.368 + &xsum0, &xxsum0 1.369 + ); 1.370 + 1.371 + vp8_filter_block2d_bil_var_sse2( 1.372 + src_ptr + 8, src_pixels_per_line, 1.373 + dst_ptr + 8, dst_pixels_per_line, 16, 1.374 + xoffset, yoffset, 1.375 + &xsum1, &xxsum1 1.376 + ); 1.377 + xsum0 += xsum1; 1.378 + xxsum0 += xxsum1; 1.379 + } 1.380 + 1.381 + *sse = xxsum0; 1.382 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 1.383 +} 1.384 + 1.385 +unsigned int vp8_sub_pixel_mse16x16_wmt( 1.386 + const unsigned char *src_ptr, 1.387 + int src_pixels_per_line, 1.388 + int xoffset, 1.389 + int yoffset, 1.390 + const unsigned char *dst_ptr, 1.391 + int dst_pixels_per_line, 1.392 + unsigned int *sse 1.393 +) 1.394 +{ 1.395 + vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 1.396 + return *sse; 1.397 +} 1.398 + 1.399 +unsigned int vp8_sub_pixel_variance16x8_wmt 1.400 +( 1.401 + const unsigned char *src_ptr, 1.402 + int src_pixels_per_line, 1.403 + int xoffset, 1.404 + int yoffset, 1.405 + const unsigned char *dst_ptr, 1.406 + int dst_pixels_per_line, 1.407 + unsigned int *sse 1.408 + 1.409 +) 1.410 +{ 1.411 + int xsum0, xsum1; 1.412 + unsigned int xxsum0, xxsum1; 1.413 + 1.414 + if (xoffset == 4 && yoffset == 0) 1.415 + { 1.416 + vp8_half_horiz_variance16x_h_sse2( 1.417 + src_ptr, src_pixels_per_line, 1.418 + dst_ptr, dst_pixels_per_line, 8, 1.419 + &xsum0, &xxsum0); 1.420 + } 1.421 + else if (xoffset == 0 && yoffset == 4) 1.422 + { 1.423 + vp8_half_vert_variance16x_h_sse2( 1.424 + src_ptr, src_pixels_per_line, 1.425 + dst_ptr, dst_pixels_per_line, 8, 1.426 + &xsum0, &xxsum0); 1.427 + } 1.428 + else if (xoffset == 4 && yoffset == 4) 1.429 + { 1.430 + vp8_half_horiz_vert_variance16x_h_sse2( 1.431 + src_ptr, src_pixels_per_line, 1.432 + dst_ptr, dst_pixels_per_line, 8, 1.433 + &xsum0, &xxsum0); 1.434 + } 1.435 + else 1.436 + { 1.437 + vp8_filter_block2d_bil_var_sse2( 1.438 + src_ptr, src_pixels_per_line, 1.439 + dst_ptr, dst_pixels_per_line, 8, 1.440 + xoffset, yoffset, 1.441 + &xsum0, &xxsum0); 1.442 + 1.443 + vp8_filter_block2d_bil_var_sse2( 1.444 + src_ptr + 8, src_pixels_per_line, 1.445 + dst_ptr + 8, dst_pixels_per_line, 8, 1.446 + xoffset, yoffset, 1.447 + &xsum1, &xxsum1); 1.448 + xsum0 += xsum1; 1.449 + xxsum0 += xxsum1; 1.450 + } 1.451 + 1.452 + *sse = xxsum0; 1.453 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); 1.454 +} 1.455 + 1.456 +unsigned int vp8_sub_pixel_variance8x16_wmt 1.457 +( 1.458 + const unsigned char *src_ptr, 1.459 + int src_pixels_per_line, 1.460 + int xoffset, 1.461 + int yoffset, 1.462 + const unsigned char *dst_ptr, 1.463 + int dst_pixels_per_line, 1.464 + unsigned int *sse 1.465 +) 1.466 +{ 1.467 + int xsum; 1.468 + unsigned int xxsum; 1.469 + 1.470 + if (xoffset == 4 && yoffset == 0) 1.471 + { 1.472 + vp8_half_horiz_variance8x_h_sse2( 1.473 + src_ptr, src_pixels_per_line, 1.474 + dst_ptr, dst_pixels_per_line, 16, 1.475 + &xsum, &xxsum); 1.476 + } 1.477 + else if (xoffset == 0 && yoffset == 4) 1.478 + { 1.479 + vp8_half_vert_variance8x_h_sse2( 1.480 + src_ptr, src_pixels_per_line, 1.481 + dst_ptr, dst_pixels_per_line, 16, 1.482 + &xsum, &xxsum); 1.483 + } 1.484 + else if (xoffset == 4 && yoffset == 4) 1.485 + { 1.486 + vp8_half_horiz_vert_variance8x_h_sse2( 1.487 + src_ptr, src_pixels_per_line, 1.488 + dst_ptr, dst_pixels_per_line, 16, 1.489 + &xsum, &xxsum); 1.490 + } 1.491 + else 1.492 + { 1.493 + vp8_filter_block2d_bil_var_sse2( 1.494 + src_ptr, src_pixels_per_line, 1.495 + dst_ptr, dst_pixels_per_line, 16, 1.496 + xoffset, yoffset, 1.497 + &xsum, &xxsum); 1.498 + } 1.499 + 1.500 + *sse = xxsum; 1.501 + return (xxsum - (((unsigned int)xsum * xsum) >> 7)); 1.502 +} 1.503 + 1.504 + 1.505 +unsigned int vp8_variance_halfpixvar16x16_h_wmt( 1.506 + const unsigned char *src_ptr, 1.507 + int src_pixels_per_line, 1.508 + const unsigned char *dst_ptr, 1.509 + int dst_pixels_per_line, 1.510 + unsigned int *sse) 1.511 +{ 1.512 + int xsum0; 1.513 + unsigned int xxsum0; 1.514 + 1.515 + vp8_half_horiz_variance16x_h_sse2( 1.516 + src_ptr, src_pixels_per_line, 1.517 + dst_ptr, dst_pixels_per_line, 16, 1.518 + &xsum0, &xxsum0); 1.519 + 1.520 + *sse = xxsum0; 1.521 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 1.522 +} 1.523 + 1.524 + 1.525 +unsigned int vp8_variance_halfpixvar16x16_v_wmt( 1.526 + const unsigned char *src_ptr, 1.527 + int src_pixels_per_line, 1.528 + const unsigned char *dst_ptr, 1.529 + int dst_pixels_per_line, 1.530 + unsigned int *sse) 1.531 +{ 1.532 + int xsum0; 1.533 + unsigned int xxsum0; 1.534 + vp8_half_vert_variance16x_h_sse2( 1.535 + src_ptr, src_pixels_per_line, 1.536 + dst_ptr, dst_pixels_per_line, 16, 1.537 + &xsum0, &xxsum0); 1.538 + 1.539 + *sse = xxsum0; 1.540 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 1.541 +} 1.542 + 1.543 + 1.544 +unsigned int vp8_variance_halfpixvar16x16_hv_wmt( 1.545 + const unsigned char *src_ptr, 1.546 + int src_pixels_per_line, 1.547 + const unsigned char *dst_ptr, 1.548 + int dst_pixels_per_line, 1.549 + unsigned int *sse) 1.550 +{ 1.551 + int xsum0; 1.552 + unsigned int xxsum0; 1.553 + 1.554 + vp8_half_horiz_vert_variance16x_h_sse2( 1.555 + src_ptr, src_pixels_per_line, 1.556 + dst_ptr, dst_pixels_per_line, 16, 1.557 + &xsum0, &xxsum0); 1.558 + 1.559 + *sse = xxsum0; 1.560 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 1.561 +}