1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/common/x86/variance_mmx.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,398 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 +#include "vpx_config.h" 1.15 +#include "vp8/common/variance.h" 1.16 +#include "vp8/common/pragmas.h" 1.17 +#include "vpx_ports/mem.h" 1.18 +#include "vp8/common/x86/filter_x86.h" 1.19 + 1.20 +extern void filter_block1d_h6_mmx 1.21 +( 1.22 + const unsigned char *src_ptr, 1.23 + unsigned short *output_ptr, 1.24 + unsigned int src_pixels_per_line, 1.25 + unsigned int pixel_step, 1.26 + unsigned int output_height, 1.27 + unsigned int output_width, 1.28 + short *filter 1.29 +); 1.30 +extern void filter_block1d_v6_mmx 1.31 +( 1.32 + const short *src_ptr, 1.33 + unsigned char *output_ptr, 1.34 + unsigned int pixels_per_line, 1.35 + unsigned int pixel_step, 1.36 + unsigned int output_height, 1.37 + unsigned int output_width, 1.38 + short *filter 1.39 +); 1.40 + 1.41 +extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr); 1.42 +extern unsigned int vp8_get8x8var_mmx 1.43 +( 1.44 + const unsigned char *src_ptr, 1.45 + int source_stride, 1.46 + const unsigned char *ref_ptr, 1.47 + int recon_stride, 1.48 + unsigned int *SSE, 1.49 + int *Sum 1.50 +); 1.51 +extern unsigned int vp8_get4x4var_mmx 1.52 +( 1.53 + const unsigned char *src_ptr, 1.54 + int source_stride, 1.55 + const unsigned char *ref_ptr, 1.56 + int recon_stride, 1.57 + unsigned int *SSE, 1.58 + int *Sum 1.59 +); 1.60 +extern void vp8_filter_block2d_bil4x4_var_mmx 1.61 +( 1.62 + const unsigned char *ref_ptr, 1.63 + int ref_pixels_per_line, 1.64 + const unsigned char *src_ptr, 1.65 + int src_pixels_per_line, 1.66 + const short *HFilter, 1.67 + const short *VFilter, 1.68 + int *sum, 1.69 + unsigned int *sumsquared 1.70 +); 1.71 +extern void vp8_filter_block2d_bil_var_mmx 1.72 +( 1.73 + const unsigned char *ref_ptr, 1.74 + int ref_pixels_per_line, 1.75 + const unsigned char *src_ptr, 1.76 + int src_pixels_per_line, 1.77 + unsigned int Height, 1.78 + const short *HFilter, 1.79 + const short *VFilter, 1.80 + int *sum, 1.81 + unsigned int *sumsquared 1.82 +); 1.83 + 1.84 + 1.85 +unsigned int vp8_variance4x4_mmx( 1.86 + const unsigned char *src_ptr, 1.87 + int source_stride, 1.88 + const unsigned char *ref_ptr, 1.89 + int recon_stride, 1.90 + unsigned int *sse) 1.91 +{ 1.92 + unsigned int var; 1.93 + int avg; 1.94 + 1.95 + vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 1.96 + *sse = var; 1.97 + return (var - (((unsigned int)avg * avg) >> 4)); 1.98 + 1.99 +} 1.100 + 1.101 +unsigned int vp8_variance8x8_mmx( 1.102 + const unsigned char *src_ptr, 1.103 + int source_stride, 1.104 + const unsigned char *ref_ptr, 1.105 + int recon_stride, 1.106 + unsigned int *sse) 1.107 +{ 1.108 + unsigned int var; 1.109 + int avg; 1.110 + 1.111 + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ; 1.112 + *sse = var; 1.113 + 1.114 + return (var - (((unsigned int)avg * avg) >> 6)); 1.115 + 1.116 +} 1.117 + 1.118 +unsigned int vp8_mse16x16_mmx( 1.119 + const unsigned char *src_ptr, 1.120 + int source_stride, 1.121 + const unsigned char *ref_ptr, 1.122 + int recon_stride, 1.123 + unsigned int *sse) 1.124 +{ 1.125 + unsigned int sse0, sse1, sse2, sse3, var; 1.126 + int sum0, sum1, sum2, sum3; 1.127 + 1.128 + 1.129 + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.130 + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 1.131 + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; 1.132 + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); 1.133 + 1.134 + var = sse0 + sse1 + sse2 + sse3; 1.135 + *sse = var; 1.136 + return var; 1.137 +} 1.138 + 1.139 + 1.140 +unsigned int vp8_variance16x16_mmx( 1.141 + const unsigned char *src_ptr, 1.142 + int source_stride, 1.143 + const unsigned char *ref_ptr, 1.144 + int recon_stride, 1.145 + unsigned int *sse) 1.146 +{ 1.147 + unsigned int sse0, sse1, sse2, sse3, var; 1.148 + int sum0, sum1, sum2, sum3, avg; 1.149 + 1.150 + 1.151 + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.152 + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 1.153 + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ; 1.154 + vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3); 1.155 + 1.156 + var = sse0 + sse1 + sse2 + sse3; 1.157 + avg = sum0 + sum1 + sum2 + sum3; 1.158 + *sse = var; 1.159 + return (var - (((unsigned int)avg * avg) >> 8)); 1.160 +} 1.161 + 1.162 +unsigned int vp8_variance16x8_mmx( 1.163 + const unsigned char *src_ptr, 1.164 + int source_stride, 1.165 + const unsigned char *ref_ptr, 1.166 + int recon_stride, 1.167 + unsigned int *sse) 1.168 +{ 1.169 + unsigned int sse0, sse1, var; 1.170 + int sum0, sum1, avg; 1.171 + 1.172 + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.173 + vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1); 1.174 + 1.175 + var = sse0 + sse1; 1.176 + avg = sum0 + sum1; 1.177 + *sse = var; 1.178 + return (var - (((unsigned int)avg * avg) >> 7)); 1.179 + 1.180 +} 1.181 + 1.182 + 1.183 +unsigned int vp8_variance8x16_mmx( 1.184 + const unsigned char *src_ptr, 1.185 + int source_stride, 1.186 + const unsigned char *ref_ptr, 1.187 + int recon_stride, 1.188 + unsigned int *sse) 1.189 +{ 1.190 + unsigned int sse0, sse1, var; 1.191 + int sum0, sum1, avg; 1.192 + 1.193 + vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ; 1.194 + vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ; 1.195 + 1.196 + var = sse0 + sse1; 1.197 + avg = sum0 + sum1; 1.198 + *sse = var; 1.199 + 1.200 + return (var - (((unsigned int)avg * avg) >> 7)); 1.201 + 1.202 +} 1.203 + 1.204 + 1.205 +unsigned int vp8_sub_pixel_variance4x4_mmx 1.206 +( 1.207 + const unsigned char *src_ptr, 1.208 + int src_pixels_per_line, 1.209 + int xoffset, 1.210 + int yoffset, 1.211 + const unsigned char *dst_ptr, 1.212 + int dst_pixels_per_line, 1.213 + unsigned int *sse) 1.214 + 1.215 +{ 1.216 + int xsum; 1.217 + unsigned int xxsum; 1.218 + vp8_filter_block2d_bil4x4_var_mmx( 1.219 + src_ptr, src_pixels_per_line, 1.220 + dst_ptr, dst_pixels_per_line, 1.221 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.222 + &xsum, &xxsum 1.223 + ); 1.224 + *sse = xxsum; 1.225 + return (xxsum - (((unsigned int)xsum * xsum) >> 4)); 1.226 +} 1.227 + 1.228 + 1.229 +unsigned int vp8_sub_pixel_variance8x8_mmx 1.230 +( 1.231 + const unsigned char *src_ptr, 1.232 + int src_pixels_per_line, 1.233 + int xoffset, 1.234 + int yoffset, 1.235 + const unsigned char *dst_ptr, 1.236 + int dst_pixels_per_line, 1.237 + unsigned int *sse 1.238 +) 1.239 +{ 1.240 + 1.241 + int xsum; 1.242 + unsigned int xxsum; 1.243 + vp8_filter_block2d_bil_var_mmx( 1.244 + src_ptr, src_pixels_per_line, 1.245 + dst_ptr, dst_pixels_per_line, 8, 1.246 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.247 + &xsum, &xxsum 1.248 + ); 1.249 + *sse = xxsum; 1.250 + return (xxsum - (((unsigned int)xsum * xsum) >> 6)); 1.251 +} 1.252 + 1.253 +unsigned int vp8_sub_pixel_variance16x16_mmx 1.254 +( 1.255 + const unsigned char *src_ptr, 1.256 + int src_pixels_per_line, 1.257 + int xoffset, 1.258 + int yoffset, 1.259 + const unsigned char *dst_ptr, 1.260 + int dst_pixels_per_line, 1.261 + unsigned int *sse 1.262 +) 1.263 +{ 1.264 + 1.265 + int xsum0, xsum1; 1.266 + unsigned int xxsum0, xxsum1; 1.267 + 1.268 + 1.269 + vp8_filter_block2d_bil_var_mmx( 1.270 + src_ptr, src_pixels_per_line, 1.271 + dst_ptr, dst_pixels_per_line, 16, 1.272 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.273 + &xsum0, &xxsum0 1.274 + ); 1.275 + 1.276 + 1.277 + vp8_filter_block2d_bil_var_mmx( 1.278 + src_ptr + 8, src_pixels_per_line, 1.279 + dst_ptr + 8, dst_pixels_per_line, 16, 1.280 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.281 + &xsum1, &xxsum1 1.282 + ); 1.283 + 1.284 + xsum0 += xsum1; 1.285 + xxsum0 += xxsum1; 1.286 + 1.287 + *sse = xxsum0; 1.288 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); 1.289 + 1.290 + 1.291 +} 1.292 + 1.293 +unsigned int vp8_sub_pixel_mse16x16_mmx( 1.294 + const unsigned char *src_ptr, 1.295 + int src_pixels_per_line, 1.296 + int xoffset, 1.297 + int yoffset, 1.298 + const unsigned char *dst_ptr, 1.299 + int dst_pixels_per_line, 1.300 + unsigned int *sse 1.301 +) 1.302 +{ 1.303 + vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); 1.304 + return *sse; 1.305 +} 1.306 + 1.307 +unsigned int vp8_sub_pixel_variance16x8_mmx 1.308 +( 1.309 + const unsigned char *src_ptr, 1.310 + int src_pixels_per_line, 1.311 + int xoffset, 1.312 + int yoffset, 1.313 + const unsigned char *dst_ptr, 1.314 + int dst_pixels_per_line, 1.315 + unsigned int *sse 1.316 +) 1.317 +{ 1.318 + int xsum0, xsum1; 1.319 + unsigned int xxsum0, xxsum1; 1.320 + 1.321 + 1.322 + vp8_filter_block2d_bil_var_mmx( 1.323 + src_ptr, src_pixels_per_line, 1.324 + dst_ptr, dst_pixels_per_line, 8, 1.325 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.326 + &xsum0, &xxsum0 1.327 + ); 1.328 + 1.329 + 1.330 + vp8_filter_block2d_bil_var_mmx( 1.331 + src_ptr + 8, src_pixels_per_line, 1.332 + dst_ptr + 8, dst_pixels_per_line, 8, 1.333 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.334 + &xsum1, &xxsum1 1.335 + ); 1.336 + 1.337 + xsum0 += xsum1; 1.338 + xxsum0 += xxsum1; 1.339 + 1.340 + *sse = xxsum0; 1.341 + return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); 1.342 +} 1.343 + 1.344 +unsigned int vp8_sub_pixel_variance8x16_mmx 1.345 +( 1.346 + const unsigned char *src_ptr, 1.347 + int src_pixels_per_line, 1.348 + int xoffset, 1.349 + int yoffset, 1.350 + const unsigned char *dst_ptr, 1.351 + int dst_pixels_per_line, 1.352 + unsigned int *sse 1.353 +) 1.354 +{ 1.355 + int xsum; 1.356 + unsigned int xxsum; 1.357 + vp8_filter_block2d_bil_var_mmx( 1.358 + src_ptr, src_pixels_per_line, 1.359 + dst_ptr, dst_pixels_per_line, 16, 1.360 + vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], 1.361 + &xsum, &xxsum 1.362 + ); 1.363 + *sse = xxsum; 1.364 + return (xxsum - (((unsigned int)xsum * xsum) >> 7)); 1.365 +} 1.366 + 1.367 + 1.368 +unsigned int vp8_variance_halfpixvar16x16_h_mmx( 1.369 + const unsigned char *src_ptr, 1.370 + int source_stride, 1.371 + const unsigned char *ref_ptr, 1.372 + int recon_stride, 1.373 + unsigned int *sse) 1.374 +{ 1.375 + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0, 1.376 + ref_ptr, recon_stride, sse); 1.377 +} 1.378 + 1.379 + 1.380 +unsigned int vp8_variance_halfpixvar16x16_v_mmx( 1.381 + const unsigned char *src_ptr, 1.382 + int source_stride, 1.383 + const unsigned char *ref_ptr, 1.384 + int recon_stride, 1.385 + unsigned int *sse) 1.386 +{ 1.387 + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4, 1.388 + ref_ptr, recon_stride, sse); 1.389 +} 1.390 + 1.391 + 1.392 +unsigned int vp8_variance_halfpixvar16x16_hv_mmx( 1.393 + const unsigned char *src_ptr, 1.394 + int source_stride, 1.395 + const unsigned char *ref_ptr, 1.396 + int recon_stride, 1.397 + unsigned int *sse) 1.398 +{ 1.399 + return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4, 1.400 + ref_ptr, recon_stride, sse); 1.401 +}