media/libvpx/vp9/encoder/vp9_variance_c.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/vp9_variance_c.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1094 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS.  All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +#include "./vp9_rtcd.h"
    1.15 +
    1.16 +#include "vpx_ports/mem.h"
    1.17 +#include "vpx/vpx_integer.h"
    1.18 +
    1.19 +#include "vp9/common/vp9_common.h"
    1.20 +#include "vp9/common/vp9_filter.h"
    1.21 +
    1.22 +#include "vp9/encoder/vp9_variance.h"
    1.23 +
    1.24 +void variance(const uint8_t *src_ptr,
    1.25 +              int  source_stride,
    1.26 +              const uint8_t *ref_ptr,
    1.27 +              int  recon_stride,
    1.28 +              int  w,
    1.29 +              int  h,
    1.30 +              unsigned int *sse,
    1.31 +              int *sum) {
    1.32 +  int i, j;
    1.33 +  int diff;
    1.34 +
    1.35 +  *sum = 0;
    1.36 +  *sse = 0;
    1.37 +
    1.38 +  for (i = 0; i < h; i++) {
    1.39 +    for (j = 0; j < w; j++) {
    1.40 +      diff = src_ptr[j] - ref_ptr[j];
    1.41 +      *sum += diff;
    1.42 +      *sse += diff * diff;
    1.43 +    }
    1.44 +
    1.45 +    src_ptr += source_stride;
    1.46 +    ref_ptr += recon_stride;
    1.47 +  }
    1.48 +}
    1.49 +
    1.50 +/****************************************************************************
    1.51 + *
    1.52 + *  ROUTINE       : filter_block2d_bil_first_pass
    1.53 + *
    1.54 + *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
    1.55 + *                  uint32_t src_pixels_per_line : Stride of input block.
    1.56 + *                  uint32_t pixel_step        : Offset between filter input
    1.57 + *                                               samples (see notes).
    1.58 + *                  uint32_t output_height     : Input block height.
    1.59 + *                  uint32_t output_width      : Input block width.
    1.60 + *                  int32_t  *vp9_filter       : Array of 2 bi-linear filter
    1.61 + *                                               taps.
    1.62 + *
    1.63 + *  OUTPUTS       : int32_t *output_ptr        : Pointer to filtered block.
    1.64 + *
    1.65 + *  RETURNS       : void
    1.66 + *
    1.67 + *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
    1.68 + *                  either horizontal or vertical direction to produce the
    1.69 + *                  filtered output block. Used to implement first-pass
    1.70 + *                  of 2-D separable filter.
    1.71 + *
    1.72 + *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
    1.73 + *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
    1.74 + *                  pixel_step defines whether the filter is applied
    1.75 + *                  horizontally (pixel_step=1) or vertically (pixel_step=
    1.76 + *                  stride).
    1.77 + *                  It defines the offset required to move from one input
    1.78 + *                  to the next.
    1.79 + *
    1.80 + ****************************************************************************/
    1.81 +static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
    1.82 +                                              uint16_t *output_ptr,
    1.83 +                                              unsigned int src_pixels_per_line,
    1.84 +                                              int pixel_step,
    1.85 +                                              unsigned int output_height,
    1.86 +                                              unsigned int output_width,
    1.87 +                                              const int16_t *vp9_filter) {
    1.88 +  unsigned int i, j;
    1.89 +
    1.90 +  for (i = 0; i < output_height; i++) {
    1.91 +    for (j = 0; j < output_width; j++) {
    1.92 +      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
    1.93 +                          (int)src_ptr[pixel_step] * vp9_filter[1],
    1.94 +                          FILTER_BITS);
    1.95 +
    1.96 +      src_ptr++;
    1.97 +    }
    1.98 +
    1.99 +    // Next row...
   1.100 +    src_ptr    += src_pixels_per_line - output_width;
   1.101 +    output_ptr += output_width;
   1.102 +  }
   1.103 +}
   1.104 +
   1.105 +/****************************************************************************
   1.106 + *
   1.107 + *  ROUTINE       : filter_block2d_bil_second_pass
   1.108 + *
   1.109 + *  INPUTS        : int32_t  *src_ptr          : Pointer to source block.
   1.110 + *                  uint32_t src_pixels_per_line : Stride of input block.
   1.111 + *                  uint32_t pixel_step        : Offset between filter input
   1.112 + *                                               samples (see notes).
   1.113 + *                  uint32_t output_height     : Input block height.
   1.114 + *                  uint32_t output_width      : Input block width.
   1.115 + *                  int32_t  *vp9_filter       : Array of 2 bi-linear filter
   1.116 + *                                               taps.
   1.117 + *
   1.118 + *  OUTPUTS       : uint16_t *output_ptr       : Pointer to filtered block.
   1.119 + *
   1.120 + *  RETURNS       : void
   1.121 + *
   1.122 + *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
   1.123 + *                  either horizontal or vertical direction to produce the
   1.124 + *                  filtered output block. Used to implement second-pass
   1.125 + *                  of 2-D separable filter.
   1.126 + *
   1.127 + *  SPECIAL NOTES : Requires 32-bit input as produced by
   1.128 + *                  filter_block2d_bil_first_pass.
   1.129 + *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
   1.130 + *                  pixel_step defines whether the filter is applied
   1.131 + *                  horizontally (pixel_step=1) or vertically (pixel_step=
   1.132 + *                  stride).
   1.133 + *                  It defines the offset required to move from one input
   1.134 + *                  to the next.
   1.135 + *
   1.136 + ****************************************************************************/
   1.137 +static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
   1.138 +                                               uint8_t *output_ptr,
   1.139 +                                               unsigned int src_pixels_per_line,
   1.140 +                                               unsigned int pixel_step,
   1.141 +                                               unsigned int output_height,
   1.142 +                                               unsigned int output_width,
   1.143 +                                               const int16_t *vp9_filter) {
   1.144 +  unsigned int  i, j;
   1.145 +
   1.146 +  for (i = 0; i < output_height; i++) {
   1.147 +    for (j = 0; j < output_width; j++) {
   1.148 +      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
   1.149 +                          (int)src_ptr[pixel_step] * vp9_filter[1],
   1.150 +                          FILTER_BITS);
   1.151 +      src_ptr++;
   1.152 +    }
   1.153 +
   1.154 +    src_ptr += src_pixels_per_line - output_width;
   1.155 +    output_ptr += output_width;
   1.156 +  }
   1.157 +}
   1.158 +
   1.159 +unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
   1.160 +  unsigned int i, sum = 0;
   1.161 +
   1.162 +  for (i = 0; i < 256; i++) {
   1.163 +    sum += (src_ptr[i] * src_ptr[i]);
   1.164 +  }
   1.165 +
   1.166 +  return sum;
   1.167 +}
   1.168 +
   1.169 +unsigned int vp9_variance64x32_c(const uint8_t *src_ptr,
   1.170 +                                 int  source_stride,
   1.171 +                                 const uint8_t *ref_ptr,
   1.172 +                                 int  recon_stride,
   1.173 +                                 unsigned int *sse) {
   1.174 +  unsigned int var;
   1.175 +  int avg;
   1.176 +
   1.177 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg);
   1.178 +  *sse = var;
   1.179 +  return (var - (((int64_t)avg * avg) >> 11));
   1.180 +}
   1.181 +
   1.182 +unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
   1.183 +                                           int  src_pixels_per_line,
   1.184 +                                           int  xoffset,
   1.185 +                                           int  yoffset,
   1.186 +                                           const uint8_t *dst_ptr,
   1.187 +                                           int dst_pixels_per_line,
   1.188 +                                           unsigned int *sse) {
   1.189 +  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
   1.190 +  uint8_t temp2[68 * 64];
   1.191 +  const int16_t *hfilter, *vfilter;
   1.192 +
   1.193 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.194 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.195 +
   1.196 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.197 +                                    1, 33, 64, hfilter);
   1.198 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
   1.199 +
   1.200 +  return vp9_variance64x32(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
   1.201 +}
   1.202 +
   1.203 +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
   1.204 +                                               int  src_pixels_per_line,
   1.205 +                                               int  xoffset,
   1.206 +                                               int  yoffset,
   1.207 +                                               const uint8_t *dst_ptr,
   1.208 +                                               int dst_pixels_per_line,
   1.209 +                                               unsigned int *sse,
   1.210 +                                               const uint8_t *second_pred) {
   1.211 +  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
   1.212 +  uint8_t temp2[68 * 64];
   1.213 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
   1.214 +  const int16_t *hfilter, *vfilter;
   1.215 +
   1.216 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.217 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.218 +
   1.219 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.220 +                                    1, 33, 64, hfilter);
   1.221 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
   1.222 +  comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
   1.223 +  return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
   1.224 +}
   1.225 +
   1.226 +unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
   1.227 +                                 int  source_stride,
   1.228 +                                 const uint8_t *ref_ptr,
   1.229 +                                 int  recon_stride,
   1.230 +                                 unsigned int *sse) {
   1.231 +  unsigned int var;
   1.232 +  int avg;
   1.233 +
   1.234 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg);
   1.235 +  *sse = var;
   1.236 +  return (var - (((int64_t)avg * avg) >> 11));
   1.237 +}
   1.238 +
   1.239 +unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
   1.240 +                                           int  src_pixels_per_line,
   1.241 +                                           int  xoffset,
   1.242 +                                           int  yoffset,
   1.243 +                                           const uint8_t *dst_ptr,
   1.244 +                                           int dst_pixels_per_line,
   1.245 +                                           unsigned int *sse) {
   1.246 +  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
   1.247 +  uint8_t temp2[68 * 64];
   1.248 +  const int16_t *hfilter, *vfilter;
   1.249 +
   1.250 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.251 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.252 +
   1.253 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.254 +                                    1, 65, 32, hfilter);
   1.255 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
   1.256 +
   1.257 +  return vp9_variance32x64(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
   1.258 +}
   1.259 +
   1.260 +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
   1.261 +                                               int  src_pixels_per_line,
   1.262 +                                               int  xoffset,
   1.263 +                                               int  yoffset,
   1.264 +                                               const uint8_t *dst_ptr,
   1.265 +                                               int dst_pixels_per_line,
   1.266 +                                               unsigned int *sse,
   1.267 +                                               const uint8_t *second_pred) {
   1.268 +  uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
   1.269 +  uint8_t temp2[68 * 64];
   1.270 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
   1.271 +  const int16_t *hfilter, *vfilter;
   1.272 +
   1.273 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.274 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.275 +
   1.276 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.277 +                                    1, 65, 32, hfilter);
   1.278 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
   1.279 +  comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
   1.280 +  return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
   1.281 +}
   1.282 +
   1.283 +unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
   1.284 +                                 int  source_stride,
   1.285 +                                 const uint8_t *ref_ptr,
   1.286 +                                 int  recon_stride,
   1.287 +                                 unsigned int *sse) {
   1.288 +  unsigned int var;
   1.289 +  int avg;
   1.290 +
   1.291 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg);
   1.292 +  *sse = var;
   1.293 +  return (var - (((int64_t)avg * avg) >> 9));
   1.294 +}
   1.295 +
   1.296 +unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
   1.297 +                                           int  src_pixels_per_line,
   1.298 +                                           int  xoffset,
   1.299 +                                           int  yoffset,
   1.300 +                                           const uint8_t *dst_ptr,
   1.301 +                                           int dst_pixels_per_line,
   1.302 +                                           unsigned int *sse) {
   1.303 +  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
   1.304 +  uint8_t temp2[36 * 32];
   1.305 +  const int16_t *hfilter, *vfilter;
   1.306 +
   1.307 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.308 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.309 +
   1.310 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.311 +                                    1, 17, 32, hfilter);
   1.312 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
   1.313 +
   1.314 +  return vp9_variance32x16(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
   1.315 +}
   1.316 +
   1.317 +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
   1.318 +                                               int  src_pixels_per_line,
   1.319 +                                               int  xoffset,
   1.320 +                                               int  yoffset,
   1.321 +                                               const uint8_t *dst_ptr,
   1.322 +                                               int dst_pixels_per_line,
   1.323 +                                               unsigned int *sse,
   1.324 +                                               const uint8_t *second_pred) {
   1.325 +  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
   1.326 +  uint8_t temp2[36 * 32];
   1.327 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
   1.328 +  const int16_t *hfilter, *vfilter;
   1.329 +
   1.330 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.331 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.332 +
   1.333 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.334 +                                    1, 17, 32, hfilter);
   1.335 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
   1.336 +  comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
   1.337 +  return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
   1.338 +}
   1.339 +
   1.340 +unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
   1.341 +                                 int  source_stride,
   1.342 +                                 const uint8_t *ref_ptr,
   1.343 +                                 int  recon_stride,
   1.344 +                                 unsigned int *sse) {
   1.345 +  unsigned int var;
   1.346 +  int avg;
   1.347 +
   1.348 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg);
   1.349 +  *sse = var;
   1.350 +  return (var - (((int64_t)avg * avg) >> 9));
   1.351 +}
   1.352 +
   1.353 +unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
   1.354 +                                           int  src_pixels_per_line,
   1.355 +                                           int  xoffset,
   1.356 +                                           int  yoffset,
   1.357 +                                           const uint8_t *dst_ptr,
   1.358 +                                           int dst_pixels_per_line,
   1.359 +                                           unsigned int *sse) {
   1.360 +  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
   1.361 +  uint8_t temp2[36 * 32];
   1.362 +  const int16_t *hfilter, *vfilter;
   1.363 +
   1.364 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.365 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.366 +
   1.367 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.368 +                                    1, 33, 16, hfilter);
   1.369 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
   1.370 +
   1.371 +  return vp9_variance16x32(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
   1.372 +}
   1.373 +
   1.374 +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
   1.375 +                                               int  src_pixels_per_line,
   1.376 +                                               int  xoffset,
   1.377 +                                               int  yoffset,
   1.378 +                                               const uint8_t *dst_ptr,
   1.379 +                                               int dst_pixels_per_line,
   1.380 +                                               unsigned int *sse,
   1.381 +                                               const uint8_t *second_pred) {
   1.382 +  uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
   1.383 +  uint8_t temp2[36 * 32];
   1.384 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
   1.385 +  const int16_t *hfilter, *vfilter;
   1.386 +
   1.387 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.388 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.389 +
   1.390 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.391 +                                    1, 33, 16, hfilter);
   1.392 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
   1.393 +  comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
   1.394 +  return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
   1.395 +}
   1.396 +
   1.397 +unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
   1.398 +                                 int  source_stride,
   1.399 +                                 const uint8_t *ref_ptr,
   1.400 +                                 int  recon_stride,
   1.401 +                                 unsigned int *sse) {
   1.402 +  unsigned int var;
   1.403 +  int avg;
   1.404 +
   1.405 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg);
   1.406 +  *sse = var;
   1.407 +  return (var - (((int64_t)avg * avg) >> 12));
   1.408 +}
   1.409 +
   1.410 +unsigned int vp9_variance32x32_c(const uint8_t *src_ptr,
   1.411 +                                 int  source_stride,
   1.412 +                                 const uint8_t *ref_ptr,
   1.413 +                                 int  recon_stride,
   1.414 +                                 unsigned int *sse) {
   1.415 +  unsigned int var;
   1.416 +  int avg;
   1.417 +
   1.418 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
   1.419 +  *sse = var;
   1.420 +  return (var - (((int64_t)avg * avg) >> 10));
   1.421 +}
   1.422 +
   1.423 +unsigned int vp9_variance16x16_c(const uint8_t *src_ptr,
   1.424 +                                 int  source_stride,
   1.425 +                                 const uint8_t *ref_ptr,
   1.426 +                                 int  recon_stride,
   1.427 +                                 unsigned int *sse) {
   1.428 +  unsigned int var;
   1.429 +  int avg;
   1.430 +
   1.431 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
   1.432 +  *sse = var;
   1.433 +  return (var - (((unsigned int)avg * avg) >> 8));
   1.434 +}
   1.435 +
   1.436 +unsigned int vp9_variance8x16_c(const uint8_t *src_ptr,
   1.437 +                                int  source_stride,
   1.438 +                                const uint8_t *ref_ptr,
   1.439 +                                int  recon_stride,
   1.440 +                                unsigned int *sse) {
   1.441 +  unsigned int var;
   1.442 +  int avg;
   1.443 +
   1.444 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
   1.445 +  *sse = var;
   1.446 +  return (var - (((unsigned int)avg * avg) >> 7));
   1.447 +}
   1.448 +
   1.449 +unsigned int vp9_variance16x8_c(const uint8_t *src_ptr,
   1.450 +                                int  source_stride,
   1.451 +                                const uint8_t *ref_ptr,
   1.452 +                                int  recon_stride,
   1.453 +                                unsigned int *sse) {
   1.454 +  unsigned int var;
   1.455 +  int avg;
   1.456 +
   1.457 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
   1.458 +  *sse = var;
   1.459 +  return (var - (((unsigned int)avg * avg) >> 7));
   1.460 +}
   1.461 +
   1.462 +void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
   1.463 +                       const uint8_t *ref_ptr, int ref_stride,
   1.464 +                       unsigned int *sse, int *sum) {
   1.465 +  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
   1.466 +}
   1.467 +
   1.468 +unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
   1.469 +                               int  source_stride,
   1.470 +                               const uint8_t *ref_ptr,
   1.471 +                               int  recon_stride,
   1.472 +                               unsigned int *sse) {
   1.473 +  unsigned int var;
   1.474 +  int avg;
   1.475 +
   1.476 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
   1.477 +  *sse = var;
   1.478 +  return (var - (((unsigned int)avg * avg) >> 6));
   1.479 +}
   1.480 +
   1.481 +unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
   1.482 +                               int  source_stride,
   1.483 +                               const uint8_t *ref_ptr,
   1.484 +                               int  recon_stride,
   1.485 +                               unsigned int *sse) {
   1.486 +  unsigned int var;
   1.487 +  int avg;
   1.488 +
   1.489 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
   1.490 +  *sse = var;
   1.491 +  return (var - (((unsigned int)avg * avg) >> 5));
   1.492 +}
   1.493 +
   1.494 +unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
   1.495 +                               int  source_stride,
   1.496 +                               const uint8_t *ref_ptr,
   1.497 +                               int  recon_stride,
   1.498 +                               unsigned int *sse) {
   1.499 +  unsigned int var;
   1.500 +  int avg;
   1.501 +
   1.502 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
   1.503 +  *sse = var;
   1.504 +  return (var - (((unsigned int)avg * avg) >> 5));
   1.505 +}
   1.506 +
   1.507 +unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
   1.508 +                               int  source_stride,
   1.509 +                               const uint8_t *ref_ptr,
   1.510 +                               int  recon_stride,
   1.511 +                               unsigned int *sse) {
   1.512 +  unsigned int var;
   1.513 +  int avg;
   1.514 +
   1.515 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
   1.516 +  *sse = var;
   1.517 +  return (var - (((unsigned int)avg * avg) >> 4));
   1.518 +}
   1.519 +
   1.520 +
   1.521 +unsigned int vp9_mse16x16_c(const uint8_t *src_ptr,
   1.522 +                            int  source_stride,
   1.523 +                            const uint8_t *ref_ptr,
   1.524 +                            int  recon_stride,
   1.525 +                            unsigned int *sse) {
   1.526 +  unsigned int var;
   1.527 +  int avg;
   1.528 +
   1.529 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
   1.530 +  *sse = var;
   1.531 +  return var;
   1.532 +}
   1.533 +
   1.534 +unsigned int vp9_mse16x8_c(const uint8_t *src_ptr,
   1.535 +                           int  source_stride,
   1.536 +                           const uint8_t *ref_ptr,
   1.537 +                           int  recon_stride,
   1.538 +                           unsigned int *sse) {
   1.539 +  unsigned int var;
   1.540 +  int avg;
   1.541 +
   1.542 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
   1.543 +  *sse = var;
   1.544 +  return var;
   1.545 +}
   1.546 +
   1.547 +unsigned int vp9_mse8x16_c(const uint8_t *src_ptr,
   1.548 +                           int  source_stride,
   1.549 +                           const uint8_t *ref_ptr,
   1.550 +                           int  recon_stride,
   1.551 +                           unsigned int *sse) {
   1.552 +  unsigned int var;
   1.553 +  int avg;
   1.554 +
   1.555 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
   1.556 +  *sse = var;
   1.557 +  return var;
   1.558 +}
   1.559 +
   1.560 +unsigned int vp9_mse8x8_c(const uint8_t *src_ptr,
   1.561 +                          int  source_stride,
   1.562 +                          const uint8_t *ref_ptr,
   1.563 +                          int  recon_stride,
   1.564 +                          unsigned int *sse) {
   1.565 +  unsigned int var;
   1.566 +  int avg;
   1.567 +
   1.568 +  variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
   1.569 +  *sse = var;
   1.570 +  return var;
   1.571 +}
   1.572 +
   1.573 +
   1.574 +unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
   1.575 +                                         int  src_pixels_per_line,
   1.576 +                                         int  xoffset,
   1.577 +                                         int  yoffset,
   1.578 +                                         const uint8_t *dst_ptr,
   1.579 +                                         int dst_pixels_per_line,
   1.580 +                                         unsigned int *sse) {
   1.581 +  uint8_t temp2[20 * 16];
   1.582 +  const int16_t *hfilter, *vfilter;
   1.583 +  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
   1.584 +
   1.585 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.586 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.587 +
   1.588 +  // First filter 1d Horizontal
   1.589 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.590 +                                    1, 5, 4, hfilter);
   1.591 +
   1.592 +  // Now filter Verticaly
   1.593 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
   1.594 +
   1.595 +  return vp9_variance4x4(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
   1.596 +}
   1.597 +
   1.598 +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
   1.599 +                                             int  src_pixels_per_line,
   1.600 +                                             int  xoffset,
   1.601 +                                             int  yoffset,
   1.602 +                                             const uint8_t *dst_ptr,
   1.603 +                                             int dst_pixels_per_line,
   1.604 +                                             unsigned int *sse,
   1.605 +                                             const uint8_t *second_pred) {
   1.606 +  uint8_t temp2[20 * 16];
   1.607 +  const int16_t *hfilter, *vfilter;
   1.608 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
   1.609 +  uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
   1.610 +
   1.611 +  hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1.612 +  vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1.613 +
   1.614 +  // First filter 1d Horizontal
   1.615 +  var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1.616 +                                    1, 5, 4, hfilter);
   1.617 +
   1.618 +  // Now filter Verticaly
   1.619 +  var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
   1.620 +  comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
   1.621 +  return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
   1.622 +}
   1.623 +
   1.624 +// 8x8 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
   1.625 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
   1.626 +// returns vp9_variance8x8() of the filtered block against dst_ptr.
   1.627 +unsigned int vp9_sub_pixel_variance8x8_c(
   1.628 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.629 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.630 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.631 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.632 +  uint16_t hpass_out[9 * 8];  // Output of the horizontal pass
   1.633 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.634 +
   1.635 +  // Horizontal filtering first.
   1.636 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.637 +                                    9, 8, htaps);
   1.638 +  // Vertical filtering of the first-pass output.
   1.639 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 8, 8, 8, 8,
   1.640 +                                     vtaps);
   1.641 +
   1.642 +  return vp9_variance8x8(filtered, 8, dst_ptr, dst_pixels_per_line, sse);
   1.643 +}
   1.644 +
   1.645 +// Compound-prediction 8x8 sub-pixel variance: bilinearly filters src_ptr per
   1.646 +// xoffset/yoffset, combines the result with second_pred through
   1.647 +// comp_avg_pred(), and returns vp9_variance8x8() of that against dst_ptr.
   1.648 +unsigned int vp9_sub_pixel_avg_variance8x8_c(
   1.649 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.650 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
   1.651 +    const uint8_t *second_pred) {
   1.652 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.653 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.654 +  uint16_t hpass_out[9 * 8];  // Output of the horizontal pass
   1.655 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.656 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 8 * 8);  // compound predictor
   1.657 +
   1.658 +  // Horizontal filtering first, then vertical filtering of its output.
   1.659 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.660 +                                    9, 8, htaps);
   1.661 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 8, 8, 8, 8,
   1.662 +                                     vtaps);
   1.663 +  // Combine with second_pred; the variance is taken on the combined block.
   1.664 +  comp_avg_pred(averaged, second_pred, 8, 8, filtered, 8);
   1.665 +  return vp9_variance8x8(averaged, 8, dst_ptr, dst_pixels_per_line, sse);
   1.666 +}
   1.667 +
   1.668 +// 16x16 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
   1.669 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
   1.670 +// returns vp9_variance16x16() of the filtered block against dst_ptr.
   1.671 +unsigned int vp9_sub_pixel_variance16x16_c(
   1.672 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.673 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.674 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.675 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.676 +  uint16_t hpass_out[17 * 16];  // Output of the horizontal pass
   1.677 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.678 +
   1.679 +  // Horizontal filtering first.
   1.680 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.681 +                                    17, 16, htaps);
   1.682 +  // Vertical filtering of the first-pass output.
   1.683 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 16, 16, 16, 16,
   1.684 +                                     vtaps);
   1.685 +
   1.686 +  return vp9_variance16x16(filtered, 16, dst_ptr, dst_pixels_per_line, sse);
   1.687 +}
   1.688 +
   1.689 +// Compound-prediction 16x16 sub-pixel variance: bilinearly filters src_ptr per
   1.690 +// xoffset/yoffset, combines the result with second_pred through
   1.691 +// comp_avg_pred(), and returns vp9_variance16x16() of that against dst_ptr.
   1.692 +unsigned int vp9_sub_pixel_avg_variance16x16_c(
   1.693 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.694 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
   1.695 +    const uint8_t *second_pred) {
   1.696 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.697 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.698 +  uint16_t hpass_out[17 * 16];  // Output of the horizontal pass
   1.699 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.700 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 16 * 16);  // compound predictor
   1.701 +
   1.702 +  // Horizontal filtering first, then vertical filtering of its output.
   1.703 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.704 +                                    17, 16, htaps);
   1.705 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 16, 16, 16, 16,
   1.706 +                                     vtaps);
   1.707 +
   1.708 +  // Combine with second_pred; the variance is taken on the combined block.
   1.709 +  comp_avg_pred(averaged, second_pred, 16, 16, filtered, 16);
   1.710 +  return vp9_variance16x16(averaged, 16, dst_ptr, dst_pixels_per_line, sse);
   1.711 +}
   1.712 +
   1.713 +// 64x64 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
   1.714 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
   1.715 +// returns vp9_variance64x64() of the filtered block against dst_ptr.
   1.716 +unsigned int vp9_sub_pixel_variance64x64_c(
   1.717 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.718 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.719 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.720 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.721 +  uint16_t hpass_out[65 * 64];  // Output of the horizontal pass
   1.722 +  uint8_t filtered[68 * 64];  // Output of the vertical pass
   1.723 +
   1.724 +  // Horizontal filtering first.
   1.725 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.726 +                                    65, 64, htaps);
   1.727 +  // Vertical filtering of the first-pass output.
   1.728 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 64, 64, 64, 64,
   1.729 +                                     vtaps);
   1.730 +
   1.731 +  return vp9_variance64x64(filtered, 64, dst_ptr, dst_pixels_per_line, sse);
   1.732 +}
   1.733 +
   1.734 +// Compound-prediction 64x64 sub-pixel variance: bilinearly filters src_ptr per
   1.735 +// xoffset/yoffset, combines the result with second_pred through
   1.736 +// comp_avg_pred(), and returns vp9_variance64x64() of that against dst_ptr.
   1.737 +unsigned int vp9_sub_pixel_avg_variance64x64_c(
   1.738 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.739 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
   1.740 +    const uint8_t *second_pred) {
   1.741 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.742 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.743 +  uint16_t hpass_out[65 * 64];  // Output of the horizontal pass
   1.744 +  uint8_t filtered[68 * 64];  // Output of the vertical pass
   1.745 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 64 * 64);  // compound predictor
   1.746 +
   1.747 +  // Horizontal filtering first, then vertical filtering of its output.
   1.748 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.749 +                                    65, 64, htaps);
   1.750 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 64, 64, 64, 64,
   1.751 +                                     vtaps);
   1.752 +  // Combine with second_pred; the variance is taken on the combined block.
   1.753 +  comp_avg_pred(averaged, second_pred, 64, 64, filtered, 64);
   1.754 +  return vp9_variance64x64(averaged, 64, dst_ptr, dst_pixels_per_line, sse);
   1.755 +}
   1.756 +
   1.757 +// 32x32 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
   1.758 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
   1.759 +// returns vp9_variance32x32() of the filtered block against dst_ptr.
   1.760 +unsigned int vp9_sub_pixel_variance32x32_c(
   1.761 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.762 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.763 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.764 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.765 +  uint16_t hpass_out[33 * 32];  // Output of the horizontal pass
   1.766 +  uint8_t filtered[36 * 32];  // Output of the vertical pass
   1.767 +
   1.768 +  // Horizontal filtering first.
   1.769 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.770 +                                    33, 32, htaps);
   1.771 +  // Vertical filtering of the first-pass output.
   1.772 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 32, 32, 32, 32,
   1.773 +                                     vtaps);
   1.774 +
   1.775 +  return vp9_variance32x32(filtered, 32, dst_ptr, dst_pixels_per_line, sse);
   1.776 +}
   1.777 +
   1.778 +// Compound-prediction 32x32 sub-pixel variance: bilinearly filters src_ptr per
   1.779 +// xoffset/yoffset, combines the result with second_pred through
   1.780 +// comp_avg_pred(), and returns vp9_variance32x32() of that against dst_ptr.
   1.781 +unsigned int vp9_sub_pixel_avg_variance32x32_c(
   1.782 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.783 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
   1.784 +    const uint8_t *second_pred) {
   1.785 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.786 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.787 +  uint16_t hpass_out[33 * 32];  // Output of the horizontal pass
   1.788 +  uint8_t filtered[36 * 32];  // Output of the vertical pass
   1.789 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 32 * 32);  // compound predictor
   1.790 +
   1.791 +  // Horizontal filtering first, then vertical filtering of its output.
   1.792 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.793 +                                    33, 32, htaps);
   1.794 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 32, 32, 32, 32,
   1.795 +                                     vtaps);
   1.796 +  // Combine with second_pred; the variance is taken on the combined block.
   1.797 +  comp_avg_pred(averaged, second_pred, 32, 32, filtered, 32);
   1.798 +  return vp9_variance32x32(averaged, 32, dst_ptr, dst_pixels_per_line, sse);
   1.799 +}
   1.800 +
   1.801 +// 16x16 "halfpix" variance, horizontal case: fixed offsets x = 8, y = 0.
   1.802 +unsigned int vp9_variance_halfpixvar16x16_h_c(
   1.803 +    const uint8_t *src_ptr, int source_stride,
   1.804 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.805 +  const int xoffset = 8, yoffset = 0;
   1.806 +  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, xoffset, yoffset,
   1.807 +                                       ref_ptr, recon_stride, sse);
   1.808 +}
   1.809 +
   1.810 +// 32x32 "halfpix" variance, horizontal case: fixed offsets x = 8, y = 0.
   1.811 +unsigned int vp9_variance_halfpixvar32x32_h_c(
   1.812 +    const uint8_t *src_ptr, int source_stride,
   1.813 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.814 +  const int xoffset = 8, yoffset = 0;
   1.815 +  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, xoffset, yoffset,
   1.816 +                                       ref_ptr, recon_stride, sse);
   1.817 +}
   1.818 +
   1.819 +// 64x64 "halfpix" variance, horizontal case: fixed offsets x = 8, y = 0.
   1.820 +unsigned int vp9_variance_halfpixvar64x64_h_c(
   1.821 +    const uint8_t *src_ptr, int source_stride,
   1.822 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.823 +  const int xoffset = 8, yoffset = 0;
   1.824 +  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, xoffset, yoffset,
   1.825 +                                       ref_ptr, recon_stride, sse);
   1.826 +}
   1.827 +
   1.828 +// 16x16 "halfpix" variance, vertical case: fixed offsets x = 0, y = 8.
   1.829 +unsigned int vp9_variance_halfpixvar16x16_v_c(
   1.830 +    const uint8_t *src_ptr, int source_stride,
   1.831 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.832 +  const int xoffset = 0, yoffset = 8;
   1.833 +  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, xoffset, yoffset,
   1.834 +                                       ref_ptr, recon_stride, sse);
   1.835 +}
   1.836 +
   1.837 +// 32x32 "halfpix" variance, vertical case: fixed offsets x = 0, y = 8.
   1.838 +unsigned int vp9_variance_halfpixvar32x32_v_c(
   1.839 +    const uint8_t *src_ptr, int source_stride,
   1.840 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.841 +  const int xoffset = 0, yoffset = 8;
   1.842 +  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, xoffset, yoffset,
   1.843 +                                       ref_ptr, recon_stride, sse);
   1.844 +}
   1.845 +
   1.846 +// 64x64 "halfpix" variance, vertical case: fixed offsets x = 0, y = 8.
   1.847 +unsigned int vp9_variance_halfpixvar64x64_v_c(
   1.848 +    const uint8_t *src_ptr, int source_stride,
   1.849 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.850 +  const int xoffset = 0, yoffset = 8;
   1.851 +  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, xoffset, yoffset,
   1.852 +                                       ref_ptr, recon_stride, sse);
   1.853 +}
   1.854 +
   1.855 +// 16x16 "halfpix" variance, diagonal (hv) case: fixed offsets x = 8, y = 8.
   1.856 +unsigned int vp9_variance_halfpixvar16x16_hv_c(
   1.857 +    const uint8_t *src_ptr, int source_stride,
   1.858 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.859 +  const int xoffset = 8, yoffset = 8;
   1.860 +  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, xoffset, yoffset,
   1.861 +                                       ref_ptr, recon_stride, sse);
   1.862 +}
   1.863 +
   1.864 +// 32x32 "halfpix" variance, diagonal (hv) case: fixed offsets x = 8, y = 8.
   1.865 +unsigned int vp9_variance_halfpixvar32x32_hv_c(
   1.866 +    const uint8_t *src_ptr, int source_stride,
   1.867 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.868 +  const int xoffset = 8, yoffset = 8;
   1.869 +  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, xoffset, yoffset,
   1.870 +                                       ref_ptr, recon_stride, sse);
   1.871 +}
   1.872 +
   1.873 +// 64x64 "halfpix" variance, diagonal (hv) case: fixed offsets x = 8, y = 8.
   1.874 +unsigned int vp9_variance_halfpixvar64x64_hv_c(
   1.875 +    const uint8_t *src_ptr, int source_stride,
   1.876 +    const uint8_t *ref_ptr, int recon_stride, unsigned int *sse) {
   1.877 +  const int xoffset = 8, yoffset = 8;
   1.878 +  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, xoffset, yoffset,
   1.879 +                                       ref_ptr, recon_stride, sse);
   1.880 +}
   1.881 +
   1.882 +// 16x16 sub-pixel MSE entry point: runs vp9_sub_pixel_variance16x16_c() and
   1.883 +// returns the value it stored in *sse rather than its variance result.
   1.884 +unsigned int vp9_sub_pixel_mse16x16_c(
   1.885 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.886 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.887 +  // The callee's return value is ignored; only *sse is reported.
   1.888 +  (void)vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset,
   1.889 +                                      yoffset, dst_ptr, dst_pixels_per_line,
   1.890 +                                      sse);
   1.891 +
   1.892 +  return *sse;
   1.893 +}
   1.894 +
   1.895 +// 32x32 sub-pixel MSE entry point: runs vp9_sub_pixel_variance32x32_c() and
   1.896 +// returns the value it stored in *sse rather than its variance result.
   1.897 +unsigned int vp9_sub_pixel_mse32x32_c(
   1.898 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.899 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.900 +  // The callee's return value is ignored; only *sse is reported.
   1.901 +  (void)vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line, xoffset,
   1.902 +                                      yoffset, dst_ptr, dst_pixels_per_line,
   1.903 +                                      sse);
   1.904 +
   1.905 +  return *sse;
   1.906 +}
   1.907 +
   1.908 +// 64x64 sub-pixel MSE entry point: runs vp9_sub_pixel_variance64x64_c() and
   1.909 +// returns the value it stored in *sse rather than its variance result.
   1.910 +unsigned int vp9_sub_pixel_mse64x64_c(
   1.911 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.912 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.913 +  // The callee's return value is ignored; only *sse is reported.
   1.914 +  (void)vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line, xoffset,
   1.915 +                                      yoffset, dst_ptr, dst_pixels_per_line,
   1.916 +                                      sse);
   1.917 +
   1.918 +  return *sse;
   1.919 +}
   1.920 +
   1.921 +// 16x8 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
   1.922 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
   1.923 +// returns vp9_variance16x8() of the filtered block against dst_ptr.
   1.924 +unsigned int vp9_sub_pixel_variance16x8_c(
   1.925 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.926 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.927 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.928 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.929 +  uint16_t hpass_out[16 * 9];  // Output of the horizontal pass
   1.930 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.931 +
   1.932 +  // Horizontal filtering first.
   1.933 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.934 +                                    9, 16, htaps);
   1.935 +  // Vertical filtering of the first-pass output.
   1.936 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 16, 16, 8, 16,
   1.937 +                                     vtaps);
   1.938 +
   1.939 +  return vp9_variance16x8(filtered, 16, dst_ptr, dst_pixels_per_line, sse);
   1.940 +}
   1.941 +
   1.942 +// Compound-prediction 16x8 sub-pixel variance: bilinearly filters src_ptr per
   1.943 +// xoffset/yoffset, combines the result with second_pred through
   1.944 +// comp_avg_pred(), and returns vp9_variance16x8() of that against dst_ptr.
   1.945 +unsigned int vp9_sub_pixel_avg_variance16x8_c(
   1.946 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.947 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
   1.948 +    const uint8_t *second_pred) {
   1.949 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.950 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.951 +  uint16_t hpass_out[16 * 9];  // Output of the horizontal pass
   1.952 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.953 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 16 * 8);  // compound predictor
   1.954 +
   1.955 +  // Horizontal filtering first, then vertical filtering of its output.
   1.956 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.957 +                                    9, 16, htaps);
   1.958 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 16, 16, 8, 16,
   1.959 +                                     vtaps);
   1.960 +  // Combine with second_pred; the variance is taken on the combined block.
   1.961 +  comp_avg_pred(averaged, second_pred, 16, 8, filtered, 16);
   1.962 +  return vp9_variance16x8(averaged, 16, dst_ptr, dst_pixels_per_line, sse);
   1.963 +}
   1.964 +
   1.965 +// 8x16 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
   1.966 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
   1.967 +// returns vp9_variance8x16() of the filtered block against dst_ptr.
   1.968 +unsigned int vp9_sub_pixel_variance8x16_c(
   1.969 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.970 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
   1.971 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.972 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.973 +  uint16_t hpass_out[9 * 16];  // NOTE(review): pass below is given 17, 8
   1.974 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.975 +
   1.976 +  // Horizontal filtering first.
   1.977 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
   1.978 +                                    17, 8, htaps);
   1.979 +  // Vertical filtering of the first-pass output.
   1.980 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 8, 8, 16, 8,
   1.981 +                                     vtaps);
   1.982 +
   1.983 +  return vp9_variance8x16(filtered, 8, dst_ptr, dst_pixels_per_line, sse);
   1.984 +}
   1.985 +
   1.986 +// Compound-prediction 8x16 sub-pixel variance: bilinearly filters src_ptr per
   1.987 +// xoffset/yoffset, combines the result with second_pred through
   1.988 +// comp_avg_pred(), and returns vp9_variance8x16() of that against dst_ptr.
   1.989 +unsigned int vp9_sub_pixel_avg_variance8x16_c(
   1.990 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
   1.991 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
   1.992 +    const uint8_t *second_pred) {
   1.993 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
   1.994 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
   1.995 +  uint16_t hpass_out[9 * 16];  // NOTE(review): pass below is given 17, 8
   1.996 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
   1.997 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 8 * 16);  // compound predictor
   1.998 +
   1.999 +  // Horizontal filtering first, then vertical filtering of its output.
  1.1000 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
  1.1001 +                                    17, 8, htaps);
  1.1002 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 8, 8, 16, 8,
  1.1003 +                                     vtaps);
  1.1004 +  // Combine with second_pred; the variance is taken on the combined block.
  1.1005 +  comp_avg_pred(averaged, second_pred, 8, 16, filtered, 8);
  1.1006 +  return vp9_variance8x16(averaged, 8, dst_ptr, dst_pixels_per_line, sse);
  1.1007 +}
  1.1008 +
  1.1009 +// 8x4 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
  1.1010 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
  1.1011 +// returns vp9_variance8x4() of the filtered block against dst_ptr.
  1.1012 +unsigned int vp9_sub_pixel_variance8x4_c(
  1.1013 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
  1.1014 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  1.1015 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
  1.1016 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
  1.1017 +  uint16_t hpass_out[8 * 5];  // Output of the horizontal pass
  1.1018 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
  1.1019 +
  1.1020 +  // Horizontal filtering first.
  1.1021 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
  1.1022 +                                    5, 8, htaps);
  1.1023 +  // Vertical filtering of the first-pass output.
  1.1024 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 8, 8, 4, 8,
  1.1025 +                                     vtaps);
  1.1026 +
  1.1027 +  return vp9_variance8x4(filtered, 8, dst_ptr, dst_pixels_per_line, sse);
  1.1028 +}
  1.1029 +
  1.1030 +// Compound-prediction 8x4 sub-pixel variance: bilinearly filters src_ptr per
  1.1031 +// xoffset/yoffset, combines the result with second_pred through
  1.1032 +// comp_avg_pred(), and returns vp9_variance8x4() of that against dst_ptr.
  1.1033 +unsigned int vp9_sub_pixel_avg_variance8x4_c(
  1.1034 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
  1.1035 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
  1.1036 +    const uint8_t *second_pred) {
  1.1037 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
  1.1038 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
  1.1039 +  uint16_t hpass_out[8 * 5];  // Output of the horizontal pass
  1.1040 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
  1.1041 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 8 * 4);  // compound predictor
  1.1042 +
  1.1043 +  // Horizontal filtering first, then vertical filtering of its output.
  1.1044 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
  1.1045 +                                    5, 8, htaps);
  1.1046 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 8, 8, 4, 8,
  1.1047 +                                     vtaps);
  1.1048 +  // Combine with second_pred; the variance is taken on the combined block.
  1.1049 +  comp_avg_pred(averaged, second_pred, 8, 4, filtered, 8);
  1.1050 +  return vp9_variance8x4(averaged, 8, dst_ptr, dst_pixels_per_line, sse);
  1.1051 +}
  1.1052 +
  1.1053 +// 4x8 sub-pixel variance: filters src_ptr with the 2-tap bilinear kernels
  1.1054 +// selected by xoffset/yoffset (horizontal pass, then vertical pass) and
  1.1055 +// returns vp9_variance4x8() of the filtered block against dst_ptr.
  1.1056 +unsigned int vp9_sub_pixel_variance4x8_c(
  1.1057 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
  1.1058 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) {
  1.1059 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
  1.1060 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
  1.1061 +  uint16_t hpass_out[5 * 8];  // NOTE(review): pass below is given 9, 4
  1.1062 +  // FIXME(jingning,rbultje): this buffer probably does not need to be this
  1.1063 +  // big; the same oversizing appears in the other block-size variants.
  1.1064 +  uint8_t filtered[20 * 16];
  1.1065 +
  1.1066 +  // Horizontal filtering first.
  1.1067 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
  1.1068 +                                    9, 4, htaps);
  1.1069 +  // Vertical filtering of the first-pass output.
  1.1070 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 4, 4, 8, 4,
  1.1071 +                                     vtaps);
  1.1072 +
  1.1073 +  return vp9_variance4x8(filtered, 4, dst_ptr, dst_pixels_per_line, sse);
  1.1074 +}
  1.1075 +
  1.1076 +// Compound-prediction 4x8 sub-pixel variance: bilinearly filters src_ptr per
  1.1077 +// xoffset/yoffset, combines the result with second_pred through
  1.1078 +// comp_avg_pred(), and returns vp9_variance4x8() of that against dst_ptr.
  1.1079 +unsigned int vp9_sub_pixel_avg_variance4x8_c(
  1.1080 +    const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset,
  1.1081 +    const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse,
  1.1082 +    const uint8_t *second_pred) {
  1.1083 +  const int16_t *const htaps = BILINEAR_FILTERS_2TAP(xoffset);
  1.1084 +  const int16_t *const vtaps = BILINEAR_FILTERS_2TAP(yoffset);
  1.1085 +  uint16_t hpass_out[5 * 8];  // NOTE(review): pass below is given 9, 4
  1.1086 +  uint8_t filtered[20 * 16];  // Output of the vertical pass
  1.1087 +  DECLARE_ALIGNED_ARRAY(16, uint8_t, averaged, 4 * 8);  // compound predictor
  1.1088 +
  1.1089 +  // Horizontal filtering first, then vertical filtering of its output.
  1.1090 +  var_filter_block2d_bil_first_pass(src_ptr, hpass_out, src_pixels_per_line, 1,
  1.1091 +                                    9, 4, htaps);
  1.1092 +  var_filter_block2d_bil_second_pass(hpass_out, filtered, 4, 4, 8, 4,
  1.1093 +                                     vtaps);
  1.1094 +  // Combine with second_pred; the variance is taken on the combined block.
  1.1095 +  comp_avg_pred(averaged, second_pred, 4, 8, filtered, 4);
  1.1096 +  return vp9_variance4x8(averaged, 4, dst_ptr, dst_pixels_per_line, sse);
  1.1097 +}

mercurial