media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_variance_sse2.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,555 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS.  All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "./vpx_config.h"
    1.15 +
    1.16 +#include "vp9/encoder/vp9_variance.h"
    1.17 +#include "vp9/common/vp9_pragmas.h"
    1.18 +#include "vpx_ports/mem.h"
    1.19 +
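          +/* Forward declarations for the block sum/SSE and half-pel filter
          + * kernels, which are implemented separately in x86 assembly. */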
    1.20 +extern unsigned int vp9_get4x4var_mmx
    1.21 +(
    1.22 +  const unsigned char *src_ptr,
    1.23 +  int  source_stride,
    1.24 +  const unsigned char *ref_ptr,
    1.25 +  int  recon_stride,
    1.26 +  unsigned int *SSE,
    1.27 +  int *Sum
    1.28 +);
    1.29 +
    1.30 +unsigned int vp9_get_mb_ss_sse2
    1.31 +(
    1.32 +  const int16_t *src_ptr
    1.33 +);
    1.34 +unsigned int vp9_get16x16var_sse2
    1.35 +(
    1.36 +  const unsigned char *src_ptr,
    1.37 +  int source_stride,
    1.38 +  const unsigned char *ref_ptr,
    1.39 +  int recon_stride,
    1.40 +  unsigned int *SSE,
    1.41 +  int *Sum
    1.42 +);
    1.43 +unsigned int vp9_get8x8var_sse2
    1.44 +(
    1.45 +  const unsigned char *src_ptr,
    1.46 +  int source_stride,
    1.47 +  const unsigned char *ref_ptr,
    1.48 +  int recon_stride,
    1.49 +  unsigned int *SSE,
    1.50 +  int *Sum
    1.51 +);
    1.52 +void vp9_half_horiz_vert_variance8x_h_sse2
    1.53 +(
    1.54 +  const unsigned char *ref_ptr,
    1.55 +  int ref_pixels_per_line,
    1.56 +  const unsigned char *src_ptr,
    1.57 +  int src_pixels_per_line,
    1.58 +  unsigned int Height,
    1.59 +  int *sum,
    1.60 +  unsigned int *sumsquared
    1.61 +);
    1.62 +void vp9_half_horiz_vert_variance16x_h_sse2
    1.63 +(
    1.64 +  const unsigned char *ref_ptr,
    1.65 +  int ref_pixels_per_line,
    1.66 +  const unsigned char *src_ptr,
    1.67 +  int src_pixels_per_line,
    1.68 +  unsigned int Height,
    1.69 +  int *sum,
    1.70 +  unsigned int *sumsquared
    1.71 +);
    1.72 +void vp9_half_horiz_variance8x_h_sse2
    1.73 +(
    1.74 +  const unsigned char *ref_ptr,
    1.75 +  int ref_pixels_per_line,
    1.76 +  const unsigned char *src_ptr,
    1.77 +  int src_pixels_per_line,
    1.78 +  unsigned int Height,
    1.79 +  int *sum,
    1.80 +  unsigned int *sumsquared
    1.81 +);
    1.82 +void vp9_half_horiz_variance16x_h_sse2
    1.83 +(
    1.84 +  const unsigned char *ref_ptr,
    1.85 +  int ref_pixels_per_line,
    1.86 +  const unsigned char *src_ptr,
    1.87 +  int src_pixels_per_line,
    1.88 +  unsigned int Height,
    1.89 +  int *sum,
    1.90 +  unsigned int *sumsquared
    1.91 +);
    1.92 +void vp9_half_vert_variance8x_h_sse2
    1.93 +(
    1.94 +  const unsigned char *ref_ptr,
    1.95 +  int ref_pixels_per_line,
    1.96 +  const unsigned char *src_ptr,
    1.97 +  int src_pixels_per_line,
    1.98 +  unsigned int Height,
    1.99 +  int *sum,
   1.100 +  unsigned int *sumsquared
   1.101 +);
   1.102 +void vp9_half_vert_variance16x_h_sse2
   1.103 +(
   1.104 +  const unsigned char *ref_ptr,
   1.105 +  int ref_pixels_per_line,
   1.106 +  const unsigned char *src_ptr,
   1.107 +  int src_pixels_per_line,
   1.108 +  unsigned int Height,
   1.109 +  int *sum,
   1.110 +  unsigned int *sumsquared
   1.111 +);
   1.112 +
   1.113 +typedef unsigned int (*get_var_sse2) (
   1.114 +  const unsigned char *src_ptr,
   1.115 +  int source_stride,
   1.116 +  const unsigned char *ref_ptr,
   1.117 +  int recon_stride,
   1.118 +  unsigned int *SSE,
   1.119 +  int *Sum
   1.120 +);
   1.121 +
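          +/* Generic tiling wrapper: walks the w x h area in block_size x
          + * block_size steps and accumulates the per-block SSE and Sum
          + * returned by the supplied kernel into *sse and *sum. */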
   1.122 +static void variance_sse2(const unsigned char *src_ptr, int  source_stride,
   1.123 +                        const unsigned char *ref_ptr, int  recon_stride,
   1.124 +                        int  w, int  h, unsigned int *sse, int *sum,
   1.125 +                        get_var_sse2 var_fn, int block_size) {
   1.126 +  unsigned int sse0;
   1.127 +  int sum0;
   1.128 +  int i, j;
   1.129 +
   1.130 +  *sse = 0;
   1.131 +  *sum = 0;
   1.132 +
   1.133 +  for (i = 0; i < h; i += block_size) {
   1.134 +    for (j = 0; j < w; j += block_size) {
   1.135 +      var_fn(src_ptr + source_stride * i + j, source_stride,
   1.136 +             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
   1.137 +      *sse += sse0;
   1.138 +      *sum += sum0;
   1.139 +    }
   1.140 +  }
   1.141 +}
   1.142 +
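          +/* Each vp9_varianceWxH wrapper below computes
          + *   variance = SSE - Sum^2 / (W * H)
          + * where the division is the final right shift by log2(W * H). */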
   1.143 +unsigned int vp9_variance4x4_sse2(
   1.144 +  const unsigned char *src_ptr,
   1.145 +  int  source_stride,
   1.146 +  const unsigned char *ref_ptr,
   1.147 +  int  recon_stride,
   1.148 +  unsigned int *sse) {
   1.149 +  unsigned int var;
   1.150 +  int avg;
   1.151 +
   1.152 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4,
   1.153 +                  &var, &avg, vp9_get4x4var_mmx, 4);
   1.154 +  *sse = var;
   1.155 +  return (var - (((unsigned int)avg * avg) >> 4));
   1.156 +}
   1.157 +
   1.158 +unsigned int vp9_variance8x4_sse2(const uint8_t *src_ptr,
   1.159 +                                  int  source_stride,
   1.160 +                                  const uint8_t *ref_ptr,
   1.161 +                                  int  recon_stride,
   1.162 +                                  unsigned int *sse) {
   1.163 +  unsigned int var;
   1.164 +  int avg;
   1.165 +
   1.166 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4,
   1.167 +                  &var, &avg, vp9_get4x4var_mmx, 4);
   1.168 +  *sse = var;
   1.169 +  return (var - (((unsigned int)avg * avg) >> 5));
   1.170 +}
   1.171 +
   1.172 +unsigned int vp9_variance4x8_sse2(const uint8_t *src_ptr,
   1.173 +                                  int  source_stride,
   1.174 +                                  const uint8_t *ref_ptr,
   1.175 +                                  int  recon_stride,
   1.176 +                                  unsigned int *sse) {
   1.177 +  unsigned int var;
   1.178 +  int avg;
   1.179 +
   1.180 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8,
   1.181 +                  &var, &avg, vp9_get4x4var_mmx, 4);
   1.182 +  *sse = var;
   1.183 +  return (var - (((unsigned int)avg * avg) >> 5));
   1.184 +}
   1.185 +
   1.186 +unsigned int vp9_variance8x8_sse2
   1.187 +(
   1.188 +  const unsigned char *src_ptr,
   1.189 +  int  source_stride,
   1.190 +  const unsigned char *ref_ptr,
   1.191 +  int  recon_stride,
   1.192 +  unsigned int *sse) {
   1.193 +  unsigned int var;
   1.194 +  int avg;
   1.195 +
   1.196 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8,
   1.197 +                  &var, &avg, vp9_get8x8var_sse2, 8);
   1.198 +  *sse = var;
   1.199 +  return (var - (((unsigned int)avg * avg) >> 6));
   1.200 +}
   1.201 +
   1.202 +unsigned int vp9_variance16x8_sse2
   1.203 +(
   1.204 +  const unsigned char *src_ptr,
   1.205 +  int  source_stride,
   1.206 +  const unsigned char *ref_ptr,
   1.207 +  int  recon_stride,
   1.208 +  unsigned int *sse) {
   1.209 +  unsigned int var;
   1.210 +  int avg;
   1.211 +
   1.212 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8,
   1.213 +                  &var, &avg, vp9_get8x8var_sse2, 8);
   1.214 +  *sse = var;
   1.215 +  return (var - (((unsigned int)avg * avg) >> 7));
   1.216 +}
   1.217 +
   1.218 +unsigned int vp9_variance8x16_sse2
   1.219 +(
   1.220 +  const unsigned char *src_ptr,
   1.221 +  int  source_stride,
   1.222 +  const unsigned char *ref_ptr,
   1.223 +  int  recon_stride,
   1.224 +  unsigned int *sse) {
   1.225 +  unsigned int var;
   1.226 +  int avg;
   1.227 +
   1.228 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16,
   1.229 +                &var, &avg, vp9_get8x8var_sse2, 8);
   1.230 +  *sse = var;
   1.231 +  return (var - (((unsigned int)avg * avg) >> 7));
   1.232 +}
   1.233 +
   1.234 +unsigned int vp9_variance16x16_sse2
   1.235 +(
   1.236 +  const unsigned char *src_ptr,
   1.237 +  int  source_stride,
   1.238 +  const unsigned char *ref_ptr,
   1.239 +  int  recon_stride,
   1.240 +  unsigned int *sse) {
   1.241 +  unsigned int var;
   1.242 +  int avg;
   1.243 +
   1.244 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
   1.245 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.246 +  *sse = var;
   1.247 +  return (var - (((unsigned int)avg * avg) >> 8));
   1.248 +}
   1.249 +
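          +/* MSE is just the raw SSE of the 16x16 block; no mean term
          + * (Sum^2 / N) is subtracted. */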
   1.250 +unsigned int vp9_mse16x16_sse2(
   1.251 +  const unsigned char *src_ptr,
   1.252 +  int  source_stride,
   1.253 +  const unsigned char *ref_ptr,
   1.254 +  int  recon_stride,
   1.255 +  unsigned int *sse) {
   1.256 +  unsigned int sse0;
   1.257 +  int sum0;
   1.258 +  vp9_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
   1.259 +                       &sum0);
   1.260 +  *sse = sse0;
   1.261 +  return sse0;
   1.262 +}
   1.263 +
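          +/* For blocks of 32x32 pixels and larger the accumulated Sum can
          + * reach +/-255 * 1024, so Sum * Sum is computed in 64 bits to
          + * avoid overflow before the shift. */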
   1.264 +unsigned int vp9_variance32x32_sse2(const uint8_t *src_ptr,
   1.265 +                                    int  source_stride,
   1.266 +                                    const uint8_t *ref_ptr,
   1.267 +                                    int  recon_stride,
   1.268 +                                    unsigned int *sse) {
   1.269 +  unsigned int var;
   1.270 +  int avg;
   1.271 +
   1.272 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
   1.273 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.274 +  *sse = var;
   1.275 +  return (var - (((int64_t)avg * avg) >> 10));
   1.276 +}
   1.277 +
   1.278 +unsigned int vp9_variance32x16_sse2(const uint8_t *src_ptr,
   1.279 +                                    int  source_stride,
   1.280 +                                    const uint8_t *ref_ptr,
   1.281 +                                    int  recon_stride,
   1.282 +                                    unsigned int *sse) {
   1.283 +  unsigned int var;
   1.284 +  int avg;
   1.285 +
   1.286 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
   1.287 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.288 +  *sse = var;
   1.289 +  return (var - (((int64_t)avg * avg) >> 9));
   1.290 +}
   1.291 +
   1.292 +unsigned int vp9_variance16x32_sse2(const uint8_t *src_ptr,
   1.293 +                                    int  source_stride,
   1.294 +                                    const uint8_t *ref_ptr,
   1.295 +                                    int  recon_stride,
   1.296 +                                    unsigned int *sse) {
   1.297 +  unsigned int var;
   1.298 +  int avg;
   1.299 +
   1.300 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32,
   1.301 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.302 +  *sse = var;
   1.303 +  return (var - (((int64_t)avg * avg) >> 9));
   1.304 +}
   1.305 +
   1.306 +unsigned int vp9_variance64x64_sse2(const uint8_t *src_ptr,
   1.307 +                                    int  source_stride,
   1.308 +                                    const uint8_t *ref_ptr,
   1.309 +                                    int  recon_stride,
   1.310 +                                    unsigned int *sse) {
   1.311 +  unsigned int var;
   1.312 +  int avg;
   1.313 +
   1.314 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
   1.315 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.316 +  *sse = var;
   1.317 +  return (var - (((int64_t)avg * avg) >> 12));
   1.318 +}
   1.319 +
   1.320 +unsigned int vp9_variance64x32_sse2(const uint8_t *src_ptr,
   1.321 +                                    int  source_stride,
   1.322 +                                    const uint8_t *ref_ptr,
   1.323 +                                    int  recon_stride,
   1.324 +                                    unsigned int *sse) {
   1.325 +  unsigned int var;
   1.326 +  int avg;
   1.327 +
   1.328 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
   1.329 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.330 +  *sse = var;
   1.331 +  return (var - (((int64_t)avg * avg) >> 11));
   1.332 +}
   1.333 +
   1.334 +unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
   1.335 +                                    int  source_stride,
   1.336 +                                    const uint8_t *ref_ptr,
   1.337 +                                    int  recon_stride,
   1.338 +                                    unsigned int *sse) {
   1.339 +  unsigned int var;
   1.340 +  int avg;
   1.341 +
   1.342 +  variance_sse2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64,
   1.343 +                &var, &avg, vp9_get16x16var_sse2, 16);
   1.344 +  *sse = var;
   1.345 +  return (var - (((int64_t)avg * avg) >> 11));
   1.346 +}
   1.347 +
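          +/* DECL/DECLS emit prototypes for the external sub-pixel variance
          + * column kernels, parameterized by column width (4, 8 or 16 pixels)
          + * and SIMD flavor. */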
   1.348 +#define DECL(w, opt) \
   1.349 +int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
   1.350 +                                        ptrdiff_t src_stride, \
   1.351 +                                        int x_offset, int y_offset, \
   1.352 +                                        const uint8_t *dst, \
   1.353 +                                        ptrdiff_t dst_stride, \
   1.354 +                                        int height, unsigned int *sse)
   1.355 +#define DECLS(opt1, opt2) \
   1.356 +DECL(4, opt2); \
   1.357 +DECL(8, opt1); \
   1.358 +DECL(16, opt1)
   1.359 +
   1.360 +DECLS(sse2, sse);
   1.361 +DECLS(ssse3, ssse3);
   1.362 +#undef DECLS
   1.363 +#undef DECL
   1.364 +
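          +/* FN builds a full WxH sub-pixel variance function from the wf-wide
          + * column kernel: columns at x offsets 0, 16, 32 and 48 are processed
          + * separately and their se/sse totals combined, then the variance is
          + * formed with a shift by wlog2 + hlog2 (log2 of the pixel count).
          + * The cast argument widens se * se to 64 bits for the larger blocks. */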
   1.365 +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
   1.366 +unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
   1.367 +                                                     int src_stride, \
   1.368 +                                                     int x_offset, \
   1.369 +                                                     int y_offset, \
   1.370 +                                                     const uint8_t *dst, \
   1.371 +                                                     int dst_stride, \
   1.372 +                                                     unsigned int *sse_ptr) { \
   1.373 +  unsigned int sse; \
   1.374 +  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
   1.375 +                                                y_offset, dst, dst_stride, \
   1.376 +                                                h, &sse); \
   1.377 +  if (w > wf) { \
   1.378 +    unsigned int sse2; \
   1.379 +    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
   1.380 +                                                   x_offset, y_offset, \
   1.381 +                                                   dst + 16, dst_stride, \
   1.382 +                                                   h, &sse2); \
   1.383 +    se += se2; \
   1.384 +    sse += sse2; \
   1.385 +    if (w > wf * 2) { \
   1.386 +      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
   1.387 +                                                 x_offset, y_offset, \
   1.388 +                                                 dst + 32, dst_stride, \
   1.389 +                                                 h, &sse2); \
   1.390 +      se += se2; \
   1.391 +      sse += sse2; \
   1.392 +      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
   1.393 +                                                 x_offset, y_offset, \
   1.394 +                                                 dst + 48, dst_stride, \
   1.395 +                                                 h, &sse2); \
   1.396 +      se += se2; \
   1.397 +      sse += sse2; \
   1.398 +    } \
   1.399 +  } \
   1.400 +  *sse_ptr = sse; \
   1.401 +  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
   1.402 +}
   1.403 +
   1.404 +#define FNS(opt1, opt2) \
   1.405 +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
   1.406 +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
   1.407 +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
   1.408 +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
   1.409 +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
   1.410 +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
   1.411 +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
   1.412 +FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
   1.413 +FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
   1.414 +FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
   1.415 +FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
   1.416 +FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
   1.417 +FN(4,   4,  4, 2, 2, opt2, (unsigned int))
   1.418 +
   1.419 +FNS(sse2, sse);
   1.420 +FNS(ssse3, ssse3);
   1.421 +
   1.422 +#undef FNS
   1.423 +#undef FN
   1.424 +
   1.425 +#define DECL(w, opt) \
   1.426 +int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
   1.427 +                                            ptrdiff_t src_stride, \
   1.428 +                                            int x_offset, int y_offset, \
   1.429 +                                            const uint8_t *dst, \
   1.430 +                                            ptrdiff_t dst_stride, \
   1.431 +                                            const uint8_t *sec, \
   1.432 +                                            ptrdiff_t sec_stride, \
   1.433 +                                            int height, unsigned int *sse)
   1.434 +#define DECLS(opt1, opt2) \
   1.435 +DECL(4, opt2); \
   1.436 +DECL(8, opt1); \
   1.437 +DECL(16, opt1)
   1.438 +
   1.439 +DECLS(sse2, sse);
   1.440 +DECLS(ssse3, ssse3);
   1.441 +#undef DECL
   1.442 +#undef DECLS
   1.443 +
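          +/* Same structure as FN above, but for the _avg_ variants, which
          + * additionally average in the second predictor `sec` (laid out with
          + * stride w) before the variance is taken. */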
   1.444 +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
   1.445 +unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
   1.446 +                                                         int src_stride, \
   1.447 +                                                         int x_offset, \
   1.448 +                                                         int y_offset, \
   1.449 +                                                         const uint8_t *dst, \
   1.450 +                                                         int dst_stride, \
   1.451 +                                                         unsigned int *sseptr, \
   1.452 +                                                         const uint8_t *sec) { \
   1.453 +  unsigned int sse; \
   1.454 +  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
   1.455 +                                                    y_offset, dst, dst_stride, \
   1.456 +                                                    sec, w, h, &sse); \
   1.457 +  if (w > wf) { \
   1.458 +    unsigned int sse2; \
   1.459 +    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
   1.460 +                                                       x_offset, y_offset, \
   1.461 +                                                       dst + 16, dst_stride, \
   1.462 +                                                       sec + 16, w, h, &sse2); \
   1.463 +    se += se2; \
   1.464 +    sse += sse2; \
   1.465 +    if (w > wf * 2) { \
   1.466 +      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
   1.467 +                                                     x_offset, y_offset, \
   1.468 +                                                     dst + 32, dst_stride, \
   1.469 +                                                     sec + 32, w, h, &sse2); \
   1.470 +      se += se2; \
   1.471 +      sse += sse2; \
   1.472 +      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
   1.473 +                                                     x_offset, y_offset, \
   1.474 +                                                     dst + 48, dst_stride, \
   1.475 +                                                     sec + 48, w, h, &sse2); \
   1.476 +      se += se2; \
   1.477 +      sse += sse2; \
   1.478 +    } \
   1.479 +  } \
   1.480 +  *sseptr = sse; \
   1.481 +  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
   1.482 +}
   1.483 +
   1.484 +#define FNS(opt1, opt2) \
   1.485 +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
   1.486 +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
   1.487 +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
   1.488 +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
   1.489 +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
   1.490 +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
   1.491 +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
   1.492 +FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
   1.493 +FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
   1.494 +FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
   1.495 +FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
   1.496 +FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
   1.497 +FN(4,   4,  4, 2, 2, opt2, (unsigned int))
   1.498 +
   1.499 +FNS(sse2, sse);
   1.500 +FNS(ssse3, ssse3);
   1.501 +
   1.502 +#undef FNS
   1.503 +#undef FN
   1.504 +
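          +/* Special-cased variance for the three exact half-pel offsets
          + * (horizontal, vertical and diagonal) of a 16x16 block, built on
          + * the half_*_variance16x_h kernels declared above. */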
   1.505 +unsigned int vp9_variance_halfpixvar16x16_h_sse2(
   1.506 +  const unsigned char *src_ptr,
   1.507 +  int  src_pixels_per_line,
   1.508 +  const unsigned char *dst_ptr,
   1.509 +  int  dst_pixels_per_line,
   1.510 +  unsigned int *sse) {
   1.511 +  int xsum0;
   1.512 +  unsigned int xxsum0;
   1.513 +
   1.514 +  vp9_half_horiz_variance16x_h_sse2(
   1.515 +    src_ptr, src_pixels_per_line,
   1.516 +    dst_ptr, dst_pixels_per_line, 16,
   1.517 +    &xsum0, &xxsum0);
   1.518 +
   1.519 +  *sse = xxsum0;
   1.520 +  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   1.521 +}
   1.522 +
   1.523 +
   1.524 +unsigned int vp9_variance_halfpixvar16x16_v_sse2(
   1.525 +  const unsigned char *src_ptr,
   1.526 +  int  src_pixels_per_line,
   1.527 +  const unsigned char *dst_ptr,
   1.528 +  int  dst_pixels_per_line,
   1.529 +  unsigned int *sse) {
   1.530 +  int xsum0;
   1.531 +  unsigned int xxsum0;
   1.532 +  vp9_half_vert_variance16x_h_sse2(
   1.533 +    src_ptr, src_pixels_per_line,
   1.534 +    dst_ptr, dst_pixels_per_line, 16,
   1.535 +    &xsum0, &xxsum0);
   1.536 +
   1.537 +  *sse = xxsum0;
   1.538 +  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   1.539 +}
   1.540 +
   1.541 +
   1.542 +unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
   1.543 +  const unsigned char *src_ptr,
   1.544 +  int  src_pixels_per_line,
   1.545 +  const unsigned char *dst_ptr,
   1.546 +  int  dst_pixels_per_line,
   1.547 +  unsigned int *sse) {
   1.548 +  int xsum0;
   1.549 +  unsigned int xxsum0;
   1.550 +
   1.551 +  vp9_half_horiz_vert_variance16x_h_sse2(
   1.552 +    src_ptr, src_pixels_per_line,
   1.553 +    dst_ptr, dst_pixels_per_line, 16,
   1.554 +    &xsum0, &xxsum0);
   1.555 +
   1.556 +  *sse = xxsum0;
   1.557 +  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
   1.558 +}
