media/libvpx/vp9/encoder/vp9_encodemb.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/vp9_encodemb.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,682 @@
     1.4 +/*
     1.5 + *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 + *
     1.7 + *  Use of this source code is governed by a BSD-style license
     1.8 + *  that can be found in the LICENSE file in the root of the source
     1.9 + *  tree. An additional intellectual property rights grant can be found
    1.10 + *  in the file PATENTS.  All contributing project authors may
    1.11 + *  be found in the AUTHORS file in the root of the source tree.
    1.12 + */
    1.13 +
    1.14 +
    1.15 +#include "./vp9_rtcd.h"
    1.16 +#include "./vpx_config.h"
    1.17 +
    1.18 +#include "vpx_mem/vpx_mem.h"
    1.19 +
    1.20 +#include "vp9/common/vp9_idct.h"
    1.21 +#include "vp9/common/vp9_reconinter.h"
    1.22 +#include "vp9/common/vp9_reconintra.h"
    1.23 +#include "vp9/common/vp9_systemdependent.h"
    1.24 +
    1.25 +#include "vp9/encoder/vp9_dct.h"
    1.26 +#include "vp9/encoder/vp9_encodemb.h"
    1.27 +#include "vp9/encoder/vp9_quantize.h"
    1.28 +#include "vp9/encoder/vp9_rdopt.h"
    1.29 +#include "vp9/encoder/vp9_tokenize.h"
    1.30 +
    1.31 +void vp9_subtract_block_c(int rows, int cols,
    1.32 +                          int16_t *diff_ptr, ptrdiff_t diff_stride,
    1.33 +                          const uint8_t *src_ptr, ptrdiff_t src_stride,
    1.34 +                          const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
    1.35 +  int r, c;
    1.36 +
    1.37 +  for (r = 0; r < rows; r++) {
    1.38 +    for (c = 0; c < cols; c++)
    1.39 +      diff_ptr[c] = src_ptr[c] - pred_ptr[c];
    1.40 +
    1.41 +    diff_ptr += diff_stride;
    1.42 +    pred_ptr += pred_stride;
    1.43 +    src_ptr  += src_stride;
    1.44 +  }
    1.45 +}
    1.46 +
    1.47 +static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
    1.48 +  struct macroblock_plane *const p = &x->plane[plane];
    1.49 +  const MACROBLOCKD *const xd = &x->e_mbd;
    1.50 +  const struct macroblockd_plane *const pd = &xd->plane[plane];
    1.51 +  const int bw = plane_block_width(bsize, pd);
    1.52 +  const int bh = plane_block_height(bsize, pd);
    1.53 +
    1.54 +  vp9_subtract_block(bh, bw, p->src_diff, bw,
    1.55 +                     p->src.buf, p->src.stride,
    1.56 +                     pd->dst.buf, pd->dst.stride);
    1.57 +}
    1.58 +
    1.59 +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
    1.60 +  subtract_plane(x, bsize, 0);
    1.61 +}
    1.62 +
    1.63 +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) {
    1.64 +  int i;
    1.65 +
    1.66 +  for (i = 1; i < MAX_MB_PLANE; i++)
    1.67 +    subtract_plane(x, bsize, i);
    1.68 +}
    1.69 +
    1.70 +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
    1.71 +  vp9_subtract_sby(x, bsize);
    1.72 +  vp9_subtract_sbuv(x, bsize);
    1.73 +}
    1.74 +
    1.75 +#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
    1.76 +typedef struct vp9_token_state vp9_token_state;
    1.77 +
    1.78 +struct vp9_token_state {
    1.79 +  int           rate;
    1.80 +  int           error;
    1.81 +  int           next;
    1.82 +  signed char   token;
    1.83 +  short         qc;
    1.84 +};
    1.85 +
    1.86 +// TODO(jimbankoski): experiment to find optimal RD numbers.
    1.87 +#define Y1_RD_MULT 4
    1.88 +#define UV_RD_MULT 2
    1.89 +
    1.90 +static const int plane_rd_mult[4] = {
    1.91 +  Y1_RD_MULT,
    1.92 +  UV_RD_MULT,
    1.93 +};
    1.94 +
    1.95 +#define UPDATE_RD_COST()\
    1.96 +{\
    1.97 +  rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
    1.98 +  rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
    1.99 +  if (rd_cost0 == rd_cost1) {\
   1.100 +    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
   1.101 +    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
   1.102 +  }\
   1.103 +}
   1.104 +
   1.105 +// This function is a place holder for now but may ultimately need
   1.106 +// to scan previous tokens to work out the correct context.
   1.107 +static int trellis_get_coeff_context(const int16_t *scan,
   1.108 +                                     const int16_t *nb,
   1.109 +                                     int idx, int token,
   1.110 +                                     uint8_t *token_cache) {
   1.111 +  int bak = token_cache[scan[idx]], pt;
   1.112 +  token_cache[scan[idx]] = vp9_pt_energy_class[token];
   1.113 +  pt = get_coef_context(nb, token_cache, idx + 1);
   1.114 +  token_cache[scan[idx]] = bak;
   1.115 +  return pt;
   1.116 +}
   1.117 +
   1.118 +static void optimize_b(MACROBLOCK *mb,
   1.119 +                       int plane, int block, BLOCK_SIZE plane_bsize,
   1.120 +                       ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
   1.121 +                       TX_SIZE tx_size) {
   1.122 +  MACROBLOCKD *const xd = &mb->e_mbd;
   1.123 +  struct macroblockd_plane *pd = &xd->plane[plane];
   1.124 +  const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
   1.125 +  vp9_token_state tokens[1025][2];
   1.126 +  unsigned best_index[1025][2];
   1.127 +  const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block);
   1.128 +  int16_t *qcoeff_ptr;
   1.129 +  int16_t *dqcoeff_ptr;
   1.130 +  int eob = pd->eobs[block], final_eob, sz = 0;
   1.131 +  const int i0 = 0;
   1.132 +  int rc, x, next, i;
   1.133 +  int64_t rdmult, rddiv, rd_cost0, rd_cost1;
   1.134 +  int rate0, rate1, error0, error1, t0, t1;
   1.135 +  int best, band, pt;
   1.136 +  PLANE_TYPE type = pd->plane_type;
   1.137 +  int err_mult = plane_rd_mult[type];
   1.138 +  const int default_eob = 16 << (tx_size << 1);
   1.139 +  const int16_t *scan, *nb;
   1.140 +  const int mul = 1 + (tx_size == TX_32X32);
   1.141 +  uint8_t token_cache[1024];
   1.142 +  const int16_t *dequant_ptr = pd->dequant;
   1.143 +  const uint8_t *const band_translate = get_band_translate(tx_size);
   1.144 +
   1.145 +  assert((!type && !plane) || (type && plane));
   1.146 +  dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
   1.147 +  qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
   1.148 +  get_scan(xd, tx_size, type, block, &scan, &nb);
   1.149 +  assert(eob <= default_eob);
   1.150 +
   1.151 +  /* Now set up a Viterbi trellis to evaluate alternative roundings. */
   1.152 +  rdmult = mb->rdmult * err_mult;
   1.153 +  if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME)
   1.154 +    rdmult = (rdmult * 9) >> 4;
   1.155 +  rddiv = mb->rddiv;
   1.156 +  /* Initialize the sentinel node of the trellis. */
   1.157 +  tokens[eob][0].rate = 0;
   1.158 +  tokens[eob][0].error = 0;
   1.159 +  tokens[eob][0].next = default_eob;
   1.160 +  tokens[eob][0].token = DCT_EOB_TOKEN;
   1.161 +  tokens[eob][0].qc = 0;
   1.162 +  *(tokens[eob] + 1) = *(tokens[eob] + 0);
   1.163 +  next = eob;
   1.164 +  for (i = 0; i < eob; i++)
   1.165 +    token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[
   1.166 +        qcoeff_ptr[scan[i]]].token];
   1.167 +
   1.168 +  for (i = eob; i-- > i0;) {
   1.169 +    int base_bits, d2, dx;
   1.170 +
   1.171 +    rc = scan[i];
   1.172 +    x = qcoeff_ptr[rc];
   1.173 +    /* Only add a trellis state for non-zero coefficients. */
   1.174 +    if (x) {
   1.175 +      int shortcut = 0;
   1.176 +      error0 = tokens[next][0].error;
   1.177 +      error1 = tokens[next][1].error;
   1.178 +      /* Evaluate the first possibility for this state. */
   1.179 +      rate0 = tokens[next][0].rate;
   1.180 +      rate1 = tokens[next][1].rate;
   1.181 +      t0 = (vp9_dct_value_tokens_ptr + x)->token;
   1.182 +      /* Consider both possible successor states. */
   1.183 +      if (next < default_eob) {
   1.184 +        band = band_translate[i + 1];
   1.185 +        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
   1.186 +        rate0 +=
   1.187 +          mb->token_costs[tx_size][type][ref][band][0][pt]
   1.188 +                         [tokens[next][0].token];
   1.189 +        rate1 +=
   1.190 +          mb->token_costs[tx_size][type][ref][band][0][pt]
   1.191 +                         [tokens[next][1].token];
   1.192 +      }
   1.193 +      UPDATE_RD_COST();
   1.194 +      /* And pick the best. */
   1.195 +      best = rd_cost1 < rd_cost0;
   1.196 +      base_bits = *(vp9_dct_value_cost_ptr + x);
   1.197 +      dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]);
   1.198 +      d2 = dx * dx;
   1.199 +      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
   1.200 +      tokens[i][0].error = d2 + (best ? error1 : error0);
   1.201 +      tokens[i][0].next = next;
   1.202 +      tokens[i][0].token = t0;
   1.203 +      tokens[i][0].qc = x;
   1.204 +      best_index[i][0] = best;
   1.205 +
   1.206 +      /* Evaluate the second possibility for this state. */
   1.207 +      rate0 = tokens[next][0].rate;
   1.208 +      rate1 = tokens[next][1].rate;
   1.209 +
   1.210 +      if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) &&
   1.211 +          (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul +
   1.212 +                                         dequant_ptr[rc != 0]))
   1.213 +        shortcut = 1;
   1.214 +      else
   1.215 +        shortcut = 0;
   1.216 +
   1.217 +      if (shortcut) {
   1.218 +        sz = -(x < 0);
   1.219 +        x -= 2 * sz + 1;
   1.220 +      }
   1.221 +
   1.222 +      /* Consider both possible successor states. */
   1.223 +      if (!x) {
   1.224 +        /* If we reduced this coefficient to zero, check to see if
   1.225 +         *  we need to move the EOB back here.
   1.226 +         */
   1.227 +        t0 = tokens[next][0].token == DCT_EOB_TOKEN ?
   1.228 +             DCT_EOB_TOKEN : ZERO_TOKEN;
   1.229 +        t1 = tokens[next][1].token == DCT_EOB_TOKEN ?
   1.230 +             DCT_EOB_TOKEN : ZERO_TOKEN;
   1.231 +      } else {
   1.232 +        t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token;
   1.233 +      }
   1.234 +      if (next < default_eob) {
   1.235 +        band = band_translate[i + 1];
   1.236 +        if (t0 != DCT_EOB_TOKEN) {
   1.237 +          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
   1.238 +          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
   1.239 +                                  [tokens[next][0].token];
   1.240 +        }
   1.241 +        if (t1 != DCT_EOB_TOKEN) {
   1.242 +          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
   1.243 +          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
   1.244 +                                  [tokens[next][1].token];
   1.245 +        }
   1.246 +      }
   1.247 +
   1.248 +      UPDATE_RD_COST();
   1.249 +      /* And pick the best. */
   1.250 +      best = rd_cost1 < rd_cost0;
   1.251 +      base_bits = *(vp9_dct_value_cost_ptr + x);
   1.252 +
   1.253 +      if (shortcut) {
   1.254 +        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
   1.255 +        d2 = dx * dx;
   1.256 +      }
   1.257 +      tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
   1.258 +      tokens[i][1].error = d2 + (best ? error1 : error0);
   1.259 +      tokens[i][1].next = next;
   1.260 +      tokens[i][1].token = best ? t1 : t0;
   1.261 +      tokens[i][1].qc = x;
   1.262 +      best_index[i][1] = best;
   1.263 +      /* Finally, make this the new head of the trellis. */
   1.264 +      next = i;
   1.265 +    } else {
   1.266 +      /* There's no choice to make for a zero coefficient, so we don't
   1.267 +       *  add a new trellis node, but we do need to update the costs.
   1.268 +       */
   1.269 +      band = band_translate[i + 1];
   1.270 +      t0 = tokens[next][0].token;
   1.271 +      t1 = tokens[next][1].token;
   1.272 +      /* Update the cost of each path if we're past the EOB token. */
   1.273 +      if (t0 != DCT_EOB_TOKEN) {
   1.274 +        tokens[next][0].rate +=
   1.275 +            mb->token_costs[tx_size][type][ref][band][1][0][t0];
   1.276 +        tokens[next][0].token = ZERO_TOKEN;
   1.277 +      }
   1.278 +      if (t1 != DCT_EOB_TOKEN) {
   1.279 +        tokens[next][1].rate +=
   1.280 +            mb->token_costs[tx_size][type][ref][band][1][0][t1];
   1.281 +        tokens[next][1].token = ZERO_TOKEN;
   1.282 +      }
   1.283 +      best_index[i][0] = best_index[i][1] = 0;
   1.284 +      /* Don't update next, because we didn't add a new node. */
   1.285 +    }
   1.286 +  }
   1.287 +
   1.288 +  /* Now pick the best path through the whole trellis. */
   1.289 +  band = band_translate[i + 1];
   1.290 +  pt = combine_entropy_contexts(*a, *l);
   1.291 +  rate0 = tokens[next][0].rate;
   1.292 +  rate1 = tokens[next][1].rate;
   1.293 +  error0 = tokens[next][0].error;
   1.294 +  error1 = tokens[next][1].error;
   1.295 +  t0 = tokens[next][0].token;
   1.296 +  t1 = tokens[next][1].token;
   1.297 +  rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0];
   1.298 +  rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1];
   1.299 +  UPDATE_RD_COST();
   1.300 +  best = rd_cost1 < rd_cost0;
   1.301 +  final_eob = i0 - 1;
   1.302 +  vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2)));
   1.303 +  vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2)));
   1.304 +  for (i = next; i < eob; i = next) {
   1.305 +    x = tokens[i][best].qc;
   1.306 +    if (x) {
   1.307 +      final_eob = i;
   1.308 +    }
   1.309 +    rc = scan[i];
   1.310 +    qcoeff_ptr[rc] = x;
   1.311 +    dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul;
   1.312 +
   1.313 +    next = tokens[i][best].next;
   1.314 +    best = best_index[i][best];
   1.315 +  }
   1.316 +  final_eob++;
   1.317 +
   1.318 +  xd->plane[plane].eobs[block] = final_eob;
   1.319 +  *a = *l = (final_eob > 0);
   1.320 +}
   1.321 +
   1.322 +void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize,
   1.323 +                    TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) {
   1.324 +  int x, y;
   1.325 +  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
   1.326 +  optimize_b(mb, plane, block, plane_bsize,
   1.327 +             &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size);
   1.328 +}
   1.329 +
   1.330 +static void optimize_init_b(int plane, BLOCK_SIZE bsize,
   1.331 +                            struct encode_b_args *args) {
   1.332 +  const MACROBLOCKD *xd = &args->x->e_mbd;
   1.333 +  const struct macroblockd_plane* const pd = &xd->plane[plane];
   1.334 +  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
   1.335 +  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   1.336 +  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   1.337 +  const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   1.338 +  const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size;
   1.339 +
   1.340 +  vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane],
   1.341 +                           pd->above_context, pd->left_context,
   1.342 +                           num_4x4_w, num_4x4_h);
   1.343 +}
   1.344 +
   1.345 +void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize,
   1.346 +                     TX_SIZE tx_size, void *arg) {
   1.347 +  struct encode_b_args* const args = arg;
   1.348 +  MACROBLOCK* const x = args->x;
   1.349 +  MACROBLOCKD* const xd = &x->e_mbd;
   1.350 +  struct macroblock_plane *const p = &x->plane[plane];
   1.351 +  struct macroblockd_plane *const pd = &xd->plane[plane];
   1.352 +  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
   1.353 +  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
   1.354 +  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   1.355 +  const int16_t *scan, *iscan;
   1.356 +  uint16_t *eob = &pd->eobs[block];
   1.357 +  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
   1.358 +  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
   1.359 +  int xoff, yoff;
   1.360 +  int16_t *src_diff;
   1.361 +
   1.362 +  switch (tx_size) {
   1.363 +    case TX_32X32:
   1.364 +      scan = vp9_default_scan_32x32;
   1.365 +      iscan = vp9_default_iscan_32x32;
   1.366 +      block >>= 6;
   1.367 +      xoff = 32 * (block & twmask);
   1.368 +      yoff = 32 * (block >> twl);
   1.369 +      src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.370 +      if (x->use_lp32x32fdct)
   1.371 +        vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
   1.372 +      else
   1.373 +        vp9_fdct32x32(src_diff, coeff, bw * 4);
   1.374 +      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
   1.375 +                           p->quant, p->quant_shift, qcoeff, dqcoeff,
   1.376 +                           pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.377 +      break;
   1.378 +    case TX_16X16:
   1.379 +      scan = vp9_default_scan_16x16;
   1.380 +      iscan = vp9_default_iscan_16x16;
   1.381 +      block >>= 4;
   1.382 +      xoff = 16 * (block & twmask);
   1.383 +      yoff = 16 * (block >> twl);
   1.384 +      src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.385 +      vp9_fdct16x16(src_diff, coeff, bw * 4);
   1.386 +      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
   1.387 +                     p->quant, p->quant_shift, qcoeff, dqcoeff,
   1.388 +                     pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.389 +      break;
   1.390 +    case TX_8X8:
   1.391 +      scan = vp9_default_scan_8x8;
   1.392 +      iscan = vp9_default_iscan_8x8;
   1.393 +      block >>= 2;
   1.394 +      xoff = 8 * (block & twmask);
   1.395 +      yoff = 8 * (block >> twl);
   1.396 +      src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.397 +      vp9_fdct8x8(src_diff, coeff, bw * 4);
   1.398 +      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
   1.399 +                     p->quant, p->quant_shift, qcoeff, dqcoeff,
   1.400 +                     pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.401 +      break;
   1.402 +    case TX_4X4:
   1.403 +      scan = vp9_default_scan_4x4;
   1.404 +      iscan = vp9_default_iscan_4x4;
   1.405 +      xoff = 4 * (block & twmask);
   1.406 +      yoff = 4 * (block >> twl);
   1.407 +      src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.408 +      x->fwd_txm4x4(src_diff, coeff, bw * 4);
   1.409 +      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
   1.410 +                     p->quant, p->quant_shift, qcoeff, dqcoeff,
   1.411 +                     pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.412 +      break;
   1.413 +    default:
   1.414 +      assert(0);
   1.415 +  }
   1.416 +}
   1.417 +
   1.418 +static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
   1.419 +                         TX_SIZE tx_size, void *arg) {
   1.420 +  struct encode_b_args *const args = arg;
   1.421 +  MACROBLOCK *const x = args->x;
   1.422 +  MACROBLOCKD *const xd = &x->e_mbd;
   1.423 +  struct optimize_ctx *const ctx = args->ctx;
   1.424 +  struct macroblockd_plane *const pd = &xd->plane[plane];
   1.425 +  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   1.426 +  int i, j;
   1.427 +  uint8_t *dst;
   1.428 +  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
   1.429 +  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
   1.430 +
   1.431 +  // TODO(jingning): per transformed block zero forcing only enabled for
   1.432 +  // luma component. will integrate chroma components as well.
   1.433 +  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
   1.434 +    pd->eobs[block] = 0;
   1.435 +    ctx->ta[plane][i] = 0;
   1.436 +    ctx->tl[plane][j] = 0;
   1.437 +    return;
   1.438 +  }
   1.439 +
   1.440 +  if (!x->skip_recode)
   1.441 +    vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
   1.442 +
   1.443 +  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
   1.444 +    vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
   1.445 +  } else {
   1.446 +    ctx->ta[plane][i] = pd->eobs[block] > 0;
   1.447 +    ctx->tl[plane][j] = pd->eobs[block] > 0;
   1.448 +  }
   1.449 +
   1.450 +  if (x->skip_encode || pd->eobs[block] == 0)
   1.451 +    return;
   1.452 +
   1.453 +  switch (tx_size) {
   1.454 +    case TX_32X32:
   1.455 +      vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
   1.456 +      break;
   1.457 +    case TX_16X16:
   1.458 +      vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
   1.459 +      break;
   1.460 +    case TX_8X8:
   1.461 +      vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
   1.462 +      break;
   1.463 +    case TX_4X4:
   1.464 +      // this is like vp9_short_idct4x4 but has a special case around eob<=1
   1.465 +      // which is significant (not just an optimization) for the lossless
   1.466 +      // case.
   1.467 +      xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
   1.468 +      break;
   1.469 +    default:
   1.470 +      assert(!"Invalid transform size");
   1.471 +  }
   1.472 +}
   1.473 +
   1.474 +static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
   1.475 +                               TX_SIZE tx_size, void *arg) {
   1.476 +  struct encode_b_args *const args = arg;
   1.477 +  MACROBLOCK *const x = args->x;
   1.478 +  MACROBLOCKD *const xd = &x->e_mbd;
   1.479 +  struct macroblockd_plane *const pd = &xd->plane[plane];
   1.480 +  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
   1.481 +                                                       block);
   1.482 +
   1.483 +  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   1.484 +  uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
   1.485 +                                                 pd->dst.buf, pd->dst.stride);
   1.486 +
   1.487 +  vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
   1.488 +
   1.489 +  if (pd->eobs[block] == 0)
   1.490 +    return;
   1.491 +
   1.492 +  xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]);
   1.493 +}
   1.494 +
   1.495 +void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) {
   1.496 +  MACROBLOCKD *const xd = &x->e_mbd;
   1.497 +  struct optimize_ctx ctx;
   1.498 +  struct encode_b_args arg = {x, &ctx};
   1.499 +
   1.500 +  vp9_subtract_sby(x, bsize);
   1.501 +  if (x->optimize)
   1.502 +    optimize_init_b(0, bsize, &arg);
   1.503 +
   1.504 +  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg);
   1.505 +}
   1.506 +
   1.507 +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   1.508 +  MACROBLOCKD *const xd = &x->e_mbd;
   1.509 +  struct optimize_ctx ctx;
   1.510 +  struct encode_b_args arg = {x, &ctx};
   1.511 +
   1.512 +  if (!x->skip_recode)
   1.513 +    vp9_subtract_sb(x, bsize);
   1.514 +
   1.515 +  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
   1.516 +    int i;
   1.517 +    for (i = 0; i < MAX_MB_PLANE; ++i)
   1.518 +      optimize_init_b(i, bsize, &arg);
   1.519 +  }
   1.520 +
   1.521 +  foreach_transformed_block(xd, bsize, encode_block, &arg);
   1.522 +}
   1.523 +
   1.524 +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
   1.525 +                            TX_SIZE tx_size, void *arg) {
   1.526 +  struct encode_b_args* const args = arg;
   1.527 +  MACROBLOCK *const x = args->x;
   1.528 +  MACROBLOCKD *const xd = &x->e_mbd;
   1.529 +  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   1.530 +  struct macroblock_plane *const p = &x->plane[plane];
   1.531 +  struct macroblockd_plane *const pd = &xd->plane[plane];
   1.532 +  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
   1.533 +  int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block);
   1.534 +  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   1.535 +  const int16_t *scan, *iscan;
   1.536 +  TX_TYPE tx_type;
   1.537 +  MB_PREDICTION_MODE mode;
   1.538 +  const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl;
   1.539 +  const int twl = bwl - tx_size, twmask = (1 << twl) - 1;
   1.540 +  int xoff, yoff;
   1.541 +  uint8_t *src, *dst;
   1.542 +  int16_t *src_diff;
   1.543 +  uint16_t *eob = &pd->eobs[block];
   1.544 +
   1.545 +  if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0)
   1.546 +    extend_for_intra(xd, plane_bsize, plane, block, tx_size);
   1.547 +
   1.548 +  // if (x->optimize)
   1.549 +  // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx);
   1.550 +
   1.551 +  switch (tx_size) {
   1.552 +    case TX_32X32:
   1.553 +      scan = vp9_default_scan_32x32;
   1.554 +      iscan = vp9_default_iscan_32x32;
   1.555 +      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
   1.556 +      block >>= 6;
   1.557 +      xoff = 32 * (block & twmask);
   1.558 +      yoff = 32 * (block >> twl);
   1.559 +      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
   1.560 +      vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
   1.561 +                              dst, pd->dst.stride, dst, pd->dst.stride);
   1.562 +
   1.563 +      if (!x->skip_recode) {
   1.564 +        src = p->src.buf + yoff * p->src.stride + xoff;
   1.565 +        src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.566 +        vp9_subtract_block(32, 32, src_diff, bw * 4,
   1.567 +                           src, p->src.stride, dst, pd->dst.stride);
   1.568 +        if (x->use_lp32x32fdct)
   1.569 +          vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
   1.570 +        else
   1.571 +          vp9_fdct32x32(src_diff, coeff, bw * 4);
   1.572 +        vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
   1.573 +                             p->quant, p->quant_shift, qcoeff, dqcoeff,
   1.574 +                             pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.575 +      }
   1.576 +      if (!x->skip_encode && *eob)
   1.577 +        vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
   1.578 +      break;
   1.579 +    case TX_16X16:
   1.580 +      tx_type = get_tx_type_16x16(pd->plane_type, xd);
   1.581 +      scan = get_scan_16x16(tx_type);
   1.582 +      iscan = get_iscan_16x16(tx_type);
   1.583 +      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
   1.584 +      block >>= 4;
   1.585 +      xoff = 16 * (block & twmask);
   1.586 +      yoff = 16 * (block >> twl);
   1.587 +      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
   1.588 +      vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
   1.589 +                              dst, pd->dst.stride, dst, pd->dst.stride);
   1.590 +      if (!x->skip_recode) {
   1.591 +        src = p->src.buf + yoff * p->src.stride + xoff;
   1.592 +        src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.593 +        vp9_subtract_block(16, 16, src_diff, bw * 4,
   1.594 +                           src, p->src.stride, dst, pd->dst.stride);
   1.595 +        vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
   1.596 +        vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
   1.597 +                       p->quant, p->quant_shift, qcoeff, dqcoeff,
   1.598 +                       pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.599 +      }
   1.600 +      if (!x->skip_encode && *eob)
   1.601 +        vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
   1.602 +      break;
   1.603 +    case TX_8X8:
   1.604 +      tx_type = get_tx_type_8x8(pd->plane_type, xd);
   1.605 +      scan = get_scan_8x8(tx_type);
   1.606 +      iscan = get_iscan_8x8(tx_type);
   1.607 +      mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
   1.608 +      block >>= 2;
   1.609 +      xoff = 8 * (block & twmask);
   1.610 +      yoff = 8 * (block >> twl);
   1.611 +      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
   1.612 +      vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
   1.613 +                              dst, pd->dst.stride, dst, pd->dst.stride);
   1.614 +      if (!x->skip_recode) {
   1.615 +        src = p->src.buf + yoff * p->src.stride + xoff;
   1.616 +        src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.617 +        vp9_subtract_block(8, 8, src_diff, bw * 4,
   1.618 +                           src, p->src.stride, dst, pd->dst.stride);
   1.619 +        vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
   1.620 +        vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
   1.621 +                       p->quant_shift, qcoeff, dqcoeff,
   1.622 +                       pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.623 +      }
   1.624 +      if (!x->skip_encode && *eob)
   1.625 +        vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
   1.626 +      break;
   1.627 +    case TX_4X4:
   1.628 +      tx_type = get_tx_type_4x4(pd->plane_type, xd, block);
   1.629 +      scan = get_scan_4x4(tx_type);
   1.630 +      iscan = get_iscan_4x4(tx_type);
   1.631 +      if (mbmi->sb_type < BLOCK_8X8 && plane == 0)
   1.632 +        mode = xd->mi_8x8[0]->bmi[block].as_mode;
   1.633 +      else
   1.634 +        mode = plane == 0 ? mbmi->mode : mbmi->uv_mode;
   1.635 +
   1.636 +      xoff = 4 * (block & twmask);
   1.637 +      yoff = 4 * (block >> twl);
   1.638 +      dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
   1.639 +      vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
   1.640 +                              dst, pd->dst.stride, dst, pd->dst.stride);
   1.641 +
   1.642 +      if (!x->skip_recode) {
   1.643 +        src = p->src.buf + yoff * p->src.stride + xoff;
   1.644 +        src_diff = p->src_diff + 4 * bw * yoff + xoff;
   1.645 +        vp9_subtract_block(4, 4, src_diff, bw * 4,
   1.646 +                           src, p->src.stride, dst, pd->dst.stride);
   1.647 +        if (tx_type != DCT_DCT)
   1.648 +          vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
   1.649 +        else
   1.650 +          x->fwd_txm4x4(src_diff, coeff, bw * 4);
   1.651 +        vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
   1.652 +                       p->quant_shift, qcoeff, dqcoeff,
   1.653 +                       pd->dequant, p->zbin_extra, eob, scan, iscan);
   1.654 +      }
   1.655 +
   1.656 +      if (!x->skip_encode && *eob) {
   1.657 +        if (tx_type == DCT_DCT)
   1.658 +          // this is like vp9_short_idct4x4 but has a special case around eob<=1
   1.659 +          // which is significant (not just an optimization) for the lossless
   1.660 +          // case.
   1.661 +          xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob);
   1.662 +        else
   1.663 +          vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type);
   1.664 +      }
   1.665 +      break;
   1.666 +    default:
   1.667 +      assert(0);
   1.668 +  }
   1.669 +}
   1.670 +
   1.671 +void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) {
   1.672 +  MACROBLOCKD* const xd = &x->e_mbd;
   1.673 +  struct optimize_ctx ctx;
   1.674 +  struct encode_b_args arg = {x, &ctx};
   1.675 +
   1.676 +  foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra,
   1.677 +                                     &arg);
   1.678 +}
   1.679 +void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) {
   1.680 +  MACROBLOCKD* const xd = &x->e_mbd;
   1.681 +  struct optimize_ctx ctx;
   1.682 +  struct encode_b_args arg = {x, &ctx};
   1.683 +  foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
   1.684 +}
   1.685 +

mercurial