media/libvpx/vp8/common/x86/variance_mmx.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/common/x86/variance_mmx.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,398 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS.  All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "vpx_config.h"
    1.15 +#include "vp8/common/variance.h"
    1.16 +#include "vp8/common/pragmas.h"
    1.17 +#include "vpx_ports/mem.h"
    1.18 +#include "vp8/common/x86/filter_x86.h"
    1.19 +
    1.20 +extern void filter_block1d_h6_mmx
    1.21 +(
    1.22 +    const unsigned char *src_ptr,
    1.23 +    unsigned short *output_ptr,
    1.24 +    unsigned int src_pixels_per_line,
    1.25 +    unsigned int pixel_step,
    1.26 +    unsigned int output_height,
    1.27 +    unsigned int output_width,
    1.28 +    short *filter
    1.29 +);
    1.30 +extern void filter_block1d_v6_mmx
    1.31 +(
    1.32 +    const short *src_ptr,
    1.33 +    unsigned char *output_ptr,
    1.34 +    unsigned int pixels_per_line,
    1.35 +    unsigned int pixel_step,
    1.36 +    unsigned int output_height,
    1.37 +    unsigned int output_width,
    1.38 +    short *filter
    1.39 +);
    1.40 +
    1.41 +extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
    1.42 +extern unsigned int vp8_get8x8var_mmx
    1.43 +(
    1.44 +    const unsigned char *src_ptr,
    1.45 +    int  source_stride,
    1.46 +    const unsigned char *ref_ptr,
    1.47 +    int  recon_stride,
    1.48 +    unsigned int *SSE,
    1.49 +    int *Sum
    1.50 +);
    1.51 +extern unsigned int vp8_get4x4var_mmx
    1.52 +(
    1.53 +    const unsigned char *src_ptr,
    1.54 +    int  source_stride,
    1.55 +    const unsigned char *ref_ptr,
    1.56 +    int  recon_stride,
    1.57 +    unsigned int *SSE,
    1.58 +    int *Sum
    1.59 +);
    1.60 +extern void vp8_filter_block2d_bil4x4_var_mmx
    1.61 +(
    1.62 +    const unsigned char *ref_ptr,
    1.63 +    int ref_pixels_per_line,
    1.64 +    const unsigned char *src_ptr,
    1.65 +    int src_pixels_per_line,
    1.66 +    const short *HFilter,
    1.67 +    const short *VFilter,
    1.68 +    int *sum,
    1.69 +    unsigned int *sumsquared
    1.70 +);
    1.71 +extern void vp8_filter_block2d_bil_var_mmx
    1.72 +(
    1.73 +    const unsigned char *ref_ptr,
    1.74 +    int ref_pixels_per_line,
    1.75 +    const unsigned char *src_ptr,
    1.76 +    int src_pixels_per_line,
    1.77 +    unsigned int Height,
    1.78 +    const short *HFilter,
    1.79 +    const short *VFilter,
    1.80 +    int *sum,
    1.81 +    unsigned int *sumsquared
    1.82 +);
    1.83 +
    1.84 +
    1.85 +unsigned int vp8_variance4x4_mmx(
    1.86 +    const unsigned char *src_ptr,
    1.87 +    int  source_stride,
    1.88 +    const unsigned char *ref_ptr,
    1.89 +    int  recon_stride,
    1.90 +    unsigned int *sse)
    1.91 +{
    1.92 +    unsigned int var;
    1.93 +    int avg;
    1.94 +
    1.95 +    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
    1.96 +    *sse = var;
    1.97 +    return (var - (((unsigned int)avg * avg) >> 4));
    1.98 +
    1.99 +}
   1.100 +
   1.101 +unsigned int vp8_variance8x8_mmx(
   1.102 +    const unsigned char *src_ptr,
   1.103 +    int  source_stride,
   1.104 +    const unsigned char *ref_ptr,
   1.105 +    int  recon_stride,
   1.106 +    unsigned int *sse)
   1.107 +{
   1.108 +    unsigned int var;
   1.109 +    int avg;
   1.110 +
   1.111 +    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
   1.112 +    *sse = var;
   1.113 +
   1.114 +    return (var - (((unsigned int)avg * avg) >> 6));
   1.115 +
   1.116 +}
   1.117 +
   1.118 +unsigned int vp8_mse16x16_mmx(
   1.119 +    const unsigned char *src_ptr,
   1.120 +    int  source_stride,
   1.121 +    const unsigned char *ref_ptr,
   1.122 +    int  recon_stride,
   1.123 +    unsigned int *sse)
   1.124 +{
   1.125 +    unsigned int sse0, sse1, sse2, sse3, var;
   1.126 +    int sum0, sum1, sum2, sum3;
   1.127 +
   1.128 +
   1.129 +    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
   1.130 +    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
   1.131 +    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
   1.132 +    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
   1.133 +
   1.134 +    var = sse0 + sse1 + sse2 + sse3;
   1.135 +    *sse = var;
   1.136 +    return var;
   1.137 +}
   1.138 +
   1.139 +
   1.140 +unsigned int vp8_variance16x16_mmx(
   1.141 +    const unsigned char *src_ptr,
   1.142 +    int  source_stride,
   1.143 +    const unsigned char *ref_ptr,
   1.144 +    int  recon_stride,
   1.145 +    unsigned int *sse)
   1.146 +{
   1.147 +    unsigned int sse0, sse1, sse2, sse3, var;
   1.148 +    int sum0, sum1, sum2, sum3, avg;
   1.149 +
   1.150 +
   1.151 +    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
   1.152 +    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
   1.153 +    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
   1.154 +    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
   1.155 +
   1.156 +    var = sse0 + sse1 + sse2 + sse3;
   1.157 +    avg = sum0 + sum1 + sum2 + sum3;
   1.158 +    *sse = var;
   1.159 +    return (var - (((unsigned int)avg * avg) >> 8));
   1.160 +}
   1.161 +
   1.162 +unsigned int vp8_variance16x8_mmx(
   1.163 +    const unsigned char *src_ptr,
   1.164 +    int  source_stride,
   1.165 +    const unsigned char *ref_ptr,
   1.166 +    int  recon_stride,
   1.167 +    unsigned int *sse)
   1.168 +{
   1.169 +    unsigned int sse0, sse1, var;
   1.170 +    int sum0, sum1, avg;
   1.171 +
   1.172 +    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
   1.173 +    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
   1.174 +
   1.175 +    var = sse0 + sse1;
   1.176 +    avg = sum0 + sum1;
   1.177 +    *sse = var;
   1.178 +    return (var - (((unsigned int)avg * avg) >> 7));
   1.179 +
   1.180 +}
   1.181 +
   1.182 +
   1.183 +unsigned int vp8_variance8x16_mmx(
   1.184 +    const unsigned char *src_ptr,
   1.185 +    int  source_stride,
   1.186 +    const unsigned char *ref_ptr,
   1.187 +    int  recon_stride,
   1.188 +    unsigned int *sse)
   1.189 +{
   1.190 +    unsigned int sse0, sse1, var;
   1.191 +    int sum0, sum1, avg;
   1.192 +
   1.193 +    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
   1.194 +    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
   1.195 +
   1.196 +    var = sse0 + sse1;
   1.197 +    avg = sum0 + sum1;
   1.198 +    *sse = var;
   1.199 +
   1.200 +    return (var - (((unsigned int)avg * avg) >> 7));
   1.201 +
   1.202 +}
   1.203 +
   1.204 +
   1.205 +unsigned int vp8_sub_pixel_variance4x4_mmx
   1.206 +(
   1.207 +    const unsigned char  *src_ptr,
   1.208 +    int  src_pixels_per_line,
   1.209 +    int  xoffset,
   1.210 +    int  yoffset,
   1.211 +    const unsigned char *dst_ptr,
   1.212 +    int dst_pixels_per_line,
   1.213 +    unsigned int *sse)
   1.214 +
   1.215 +{
   1.216 +    int xsum;
   1.217 +    unsigned int xxsum;
   1.218 +    vp8_filter_block2d_bil4x4_var_mmx(
   1.219 +        src_ptr, src_pixels_per_line,
   1.220 +        dst_ptr, dst_pixels_per_line,
   1.221 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.222 +        &xsum, &xxsum
   1.223 +    );
   1.224 +    *sse = xxsum;
   1.225 +    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
   1.226 +}
   1.227 +
   1.228 +
   1.229 +unsigned int vp8_sub_pixel_variance8x8_mmx
   1.230 +(
   1.231 +    const unsigned char  *src_ptr,
   1.232 +    int  src_pixels_per_line,
   1.233 +    int  xoffset,
   1.234 +    int  yoffset,
   1.235 +    const unsigned char *dst_ptr,
   1.236 +    int dst_pixels_per_line,
   1.237 +    unsigned int *sse
   1.238 +)
   1.239 +{
   1.240 +
   1.241 +    int xsum;
   1.242 +    unsigned int xxsum;
   1.243 +    vp8_filter_block2d_bil_var_mmx(
   1.244 +        src_ptr, src_pixels_per_line,
   1.245 +        dst_ptr, dst_pixels_per_line, 8,
   1.246 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.247 +        &xsum, &xxsum
   1.248 +    );
   1.249 +    *sse = xxsum;
   1.250 +    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
   1.251 +}
   1.252 +
   1.253 +unsigned int vp8_sub_pixel_variance16x16_mmx
   1.254 +(
   1.255 +    const unsigned char  *src_ptr,
   1.256 +    int  src_pixels_per_line,
   1.257 +    int  xoffset,
   1.258 +    int  yoffset,
   1.259 +    const unsigned char *dst_ptr,
   1.260 +    int dst_pixels_per_line,
   1.261 +    unsigned int *sse
   1.262 +)
   1.263 +{
   1.264 +
   1.265 +    int xsum0, xsum1;
   1.266 +    unsigned int xxsum0, xxsum1;
   1.267 +
   1.268 +
   1.269 +    vp8_filter_block2d_bil_var_mmx(
   1.270 +        src_ptr, src_pixels_per_line,
   1.271 +        dst_ptr, dst_pixels_per_line, 16,
   1.272 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.273 +        &xsum0, &xxsum0
   1.274 +    );
   1.275 +
   1.276 +
   1.277 +    vp8_filter_block2d_bil_var_mmx(
   1.278 +        src_ptr + 8, src_pixels_per_line,
   1.279 +        dst_ptr + 8, dst_pixels_per_line, 16,
   1.280 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.281 +        &xsum1, &xxsum1
   1.282 +    );
   1.283 +
   1.284 +    xsum0 += xsum1;
   1.285 +    xxsum0 += xxsum1;
   1.286 +
   1.287 +    *sse = xxsum0;
   1.288 +    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   1.289 +
   1.290 +
   1.291 +}
   1.292 +
   1.293 +unsigned int vp8_sub_pixel_mse16x16_mmx(
   1.294 +    const unsigned char  *src_ptr,
   1.295 +    int  src_pixels_per_line,
   1.296 +    int  xoffset,
   1.297 +    int  yoffset,
   1.298 +    const unsigned char *dst_ptr,
   1.299 +    int dst_pixels_per_line,
   1.300 +    unsigned int *sse
   1.301 +)
   1.302 +{
   1.303 +    vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
   1.304 +    return *sse;
   1.305 +}
   1.306 +
   1.307 +unsigned int vp8_sub_pixel_variance16x8_mmx
   1.308 +(
   1.309 +    const unsigned char  *src_ptr,
   1.310 +    int  src_pixels_per_line,
   1.311 +    int  xoffset,
   1.312 +    int  yoffset,
   1.313 +    const unsigned char *dst_ptr,
   1.314 +    int dst_pixels_per_line,
   1.315 +    unsigned int *sse
   1.316 +)
   1.317 +{
   1.318 +    int xsum0, xsum1;
   1.319 +    unsigned int xxsum0, xxsum1;
   1.320 +
   1.321 +
   1.322 +    vp8_filter_block2d_bil_var_mmx(
   1.323 +        src_ptr, src_pixels_per_line,
   1.324 +        dst_ptr, dst_pixels_per_line, 8,
   1.325 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.326 +        &xsum0, &xxsum0
   1.327 +    );
   1.328 +
   1.329 +
   1.330 +    vp8_filter_block2d_bil_var_mmx(
   1.331 +        src_ptr + 8, src_pixels_per_line,
   1.332 +        dst_ptr + 8, dst_pixels_per_line, 8,
   1.333 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.334 +        &xsum1, &xxsum1
   1.335 +    );
   1.336 +
   1.337 +    xsum0 += xsum1;
   1.338 +    xxsum0 += xxsum1;
   1.339 +
   1.340 +    *sse = xxsum0;
   1.341 +    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
   1.342 +}
   1.343 +
   1.344 +unsigned int vp8_sub_pixel_variance8x16_mmx
   1.345 +(
   1.346 +    const unsigned char  *src_ptr,
   1.347 +    int  src_pixels_per_line,
   1.348 +    int  xoffset,
   1.349 +    int  yoffset,
   1.350 +    const unsigned char *dst_ptr,
   1.351 +    int dst_pixels_per_line,
   1.352 +    unsigned int *sse
   1.353 +)
   1.354 +{
   1.355 +    int xsum;
   1.356 +    unsigned int xxsum;
   1.357 +    vp8_filter_block2d_bil_var_mmx(
   1.358 +        src_ptr, src_pixels_per_line,
   1.359 +        dst_ptr, dst_pixels_per_line, 16,
   1.360 +        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
   1.361 +        &xsum, &xxsum
   1.362 +    );
   1.363 +    *sse = xxsum;
   1.364 +    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
   1.365 +}
   1.366 +
   1.367 +
   1.368 +unsigned int vp8_variance_halfpixvar16x16_h_mmx(
   1.369 +    const unsigned char *src_ptr,
   1.370 +    int  source_stride,
   1.371 +    const unsigned char *ref_ptr,
   1.372 +    int  recon_stride,
   1.373 +    unsigned int *sse)
   1.374 +{
   1.375 +    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
   1.376 +                                           ref_ptr, recon_stride, sse);
   1.377 +}
   1.378 +
   1.379 +
   1.380 +unsigned int vp8_variance_halfpixvar16x16_v_mmx(
   1.381 +    const unsigned char *src_ptr,
   1.382 +    int  source_stride,
   1.383 +    const unsigned char *ref_ptr,
   1.384 +    int  recon_stride,
   1.385 +    unsigned int *sse)
   1.386 +{
   1.387 +    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
   1.388 +                                           ref_ptr, recon_stride, sse);
   1.389 +}
   1.390 +
   1.391 +
   1.392 +unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
   1.393 +    const unsigned char *src_ptr,
   1.394 +    int  source_stride,
   1.395 +    const unsigned char *ref_ptr,
   1.396 +    int  recon_stride,
   1.397 +    unsigned int *sse)
   1.398 +{
   1.399 +    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
   1.400 +                                           ref_ptr, recon_stride, sse);
   1.401 +}

mercurial