1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp9/encoder/vp9_encodemb.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,682 @@ 1.4 +/* 1.5 + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license 1.8 + * that can be found in the LICENSE file in the root of the source 1.9 + * tree. An additional intellectual property rights grant can be found 1.10 + * in the file PATENTS. All contributing project authors may 1.11 + * be found in the AUTHORS file in the root of the source tree. 1.12 + */ 1.13 + 1.14 + 1.15 +#include "./vp9_rtcd.h" 1.16 +#include "./vpx_config.h" 1.17 + 1.18 +#include "vpx_mem/vpx_mem.h" 1.19 + 1.20 +#include "vp9/common/vp9_idct.h" 1.21 +#include "vp9/common/vp9_reconinter.h" 1.22 +#include "vp9/common/vp9_reconintra.h" 1.23 +#include "vp9/common/vp9_systemdependent.h" 1.24 + 1.25 +#include "vp9/encoder/vp9_dct.h" 1.26 +#include "vp9/encoder/vp9_encodemb.h" 1.27 +#include "vp9/encoder/vp9_quantize.h" 1.28 +#include "vp9/encoder/vp9_rdopt.h" 1.29 +#include "vp9/encoder/vp9_tokenize.h" 1.30 + 1.31 +void vp9_subtract_block_c(int rows, int cols, 1.32 + int16_t *diff_ptr, ptrdiff_t diff_stride, 1.33 + const uint8_t *src_ptr, ptrdiff_t src_stride, 1.34 + const uint8_t *pred_ptr, ptrdiff_t pred_stride) { 1.35 + int r, c; 1.36 + 1.37 + for (r = 0; r < rows; r++) { 1.38 + for (c = 0; c < cols; c++) 1.39 + diff_ptr[c] = src_ptr[c] - pred_ptr[c]; 1.40 + 1.41 + diff_ptr += diff_stride; 1.42 + pred_ptr += pred_stride; 1.43 + src_ptr += src_stride; 1.44 + } 1.45 +} 1.46 + 1.47 +static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { 1.48 + struct macroblock_plane *const p = &x->plane[plane]; 1.49 + const MACROBLOCKD *const xd = &x->e_mbd; 1.50 + const struct macroblockd_plane *const pd = &xd->plane[plane]; 1.51 + const int bw = plane_block_width(bsize, pd); 1.52 + const int bh = plane_block_height(bsize, pd); 1.53 + 1.54 + vp9_subtract_block(bh, bw, p->src_diff, bw, 1.55 + p->src.buf, p->src.stride, 1.56 + pd->dst.buf, pd->dst.stride); 1.57 +} 1.58 + 1.59 +void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.60 + subtract_plane(x, bsize, 0); 1.61 +} 1.62 + 1.63 +void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.64 + int i; 1.65 + 1.66 + for (i = 1; i < MAX_MB_PLANE; i++) 1.67 + subtract_plane(x, bsize, i); 1.68 +} 1.69 + 1.70 +void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.71 + vp9_subtract_sby(x, bsize); 1.72 + vp9_subtract_sbuv(x, bsize); 1.73 +} 1.74 + 1.75 +#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF) 1.76 +typedef struct vp9_token_state vp9_token_state; 1.77 + 1.78 +struct vp9_token_state { 1.79 + int rate; 1.80 + int error; 1.81 + int next; 1.82 + signed char token; 1.83 + short qc; 1.84 +}; 1.85 + 1.86 +// TODO(jimbankoski): experiment to find optimal RD numbers. 1.87 +#define Y1_RD_MULT 4 1.88 +#define UV_RD_MULT 2 1.89 + 1.90 +static const int plane_rd_mult[4] = { 1.91 + Y1_RD_MULT, 1.92 + UV_RD_MULT, 1.93 +}; 1.94 + 1.95 +#define UPDATE_RD_COST()\ 1.96 +{\ 1.97 + rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\ 1.98 + rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\ 1.99 + if (rd_cost0 == rd_cost1) {\ 1.100 + rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\ 1.101 + rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\ 1.102 + }\ 1.103 +} 1.104 + 1.105 +// This function is a place holder for now but may ultimately need 1.106 +// to scan previous tokens to work out the correct context. 1.107 +static int trellis_get_coeff_context(const int16_t *scan, 1.108 + const int16_t *nb, 1.109 + int idx, int token, 1.110 + uint8_t *token_cache) { 1.111 + int bak = token_cache[scan[idx]], pt; 1.112 + token_cache[scan[idx]] = vp9_pt_energy_class[token]; 1.113 + pt = get_coef_context(nb, token_cache, idx + 1); 1.114 + token_cache[scan[idx]] = bak; 1.115 + return pt; 1.116 +} 1.117 + 1.118 +static void optimize_b(MACROBLOCK *mb, 1.119 + int plane, int block, BLOCK_SIZE plane_bsize, 1.120 + ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, 1.121 + TX_SIZE tx_size) { 1.122 + MACROBLOCKD *const xd = &mb->e_mbd; 1.123 + struct macroblockd_plane *pd = &xd->plane[plane]; 1.124 + const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi); 1.125 + vp9_token_state tokens[1025][2]; 1.126 + unsigned best_index[1025][2]; 1.127 + const int16_t *coeff_ptr = BLOCK_OFFSET(mb->plane[plane].coeff, block); 1.128 + int16_t *qcoeff_ptr; 1.129 + int16_t *dqcoeff_ptr; 1.130 + int eob = pd->eobs[block], final_eob, sz = 0; 1.131 + const int i0 = 0; 1.132 + int rc, x, next, i; 1.133 + int64_t rdmult, rddiv, rd_cost0, rd_cost1; 1.134 + int rate0, rate1, error0, error1, t0, t1; 1.135 + int best, band, pt; 1.136 + PLANE_TYPE type = pd->plane_type; 1.137 + int err_mult = plane_rd_mult[type]; 1.138 + const int default_eob = 16 << (tx_size << 1); 1.139 + const int16_t *scan, *nb; 1.140 + const int mul = 1 + (tx_size == TX_32X32); 1.141 + uint8_t token_cache[1024]; 1.142 + const int16_t *dequant_ptr = pd->dequant; 1.143 + const uint8_t *const band_translate = get_band_translate(tx_size); 1.144 + 1.145 + assert((!type && !plane) || (type && plane)); 1.146 + dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block); 1.147 + qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block); 1.148 + get_scan(xd, tx_size, type, block, &scan, &nb); 1.149 + assert(eob <= default_eob); 1.150 + 1.151 + /* Now set up a Viterbi trellis to evaluate alternative roundings. */ 1.152 + rdmult = mb->rdmult * err_mult; 1.153 + if (mb->e_mbd.mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) 1.154 + rdmult = (rdmult * 9) >> 4; 1.155 + rddiv = mb->rddiv; 1.156 + /* Initialize the sentinel node of the trellis. */ 1.157 + tokens[eob][0].rate = 0; 1.158 + tokens[eob][0].error = 0; 1.159 + tokens[eob][0].next = default_eob; 1.160 + tokens[eob][0].token = DCT_EOB_TOKEN; 1.161 + tokens[eob][0].qc = 0; 1.162 + *(tokens[eob] + 1) = *(tokens[eob] + 0); 1.163 + next = eob; 1.164 + for (i = 0; i < eob; i++) 1.165 + token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ 1.166 + qcoeff_ptr[scan[i]]].token]; 1.167 + 1.168 + for (i = eob; i-- > i0;) { 1.169 + int base_bits, d2, dx; 1.170 + 1.171 + rc = scan[i]; 1.172 + x = qcoeff_ptr[rc]; 1.173 + /* Only add a trellis state for non-zero coefficients. */ 1.174 + if (x) { 1.175 + int shortcut = 0; 1.176 + error0 = tokens[next][0].error; 1.177 + error1 = tokens[next][1].error; 1.178 + /* Evaluate the first possibility for this state. */ 1.179 + rate0 = tokens[next][0].rate; 1.180 + rate1 = tokens[next][1].rate; 1.181 + t0 = (vp9_dct_value_tokens_ptr + x)->token; 1.182 + /* Consider both possible successor states. */ 1.183 + if (next < default_eob) { 1.184 + band = band_translate[i + 1]; 1.185 + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); 1.186 + rate0 += 1.187 + mb->token_costs[tx_size][type][ref][band][0][pt] 1.188 + [tokens[next][0].token]; 1.189 + rate1 += 1.190 + mb->token_costs[tx_size][type][ref][band][0][pt] 1.191 + [tokens[next][1].token]; 1.192 + } 1.193 + UPDATE_RD_COST(); 1.194 + /* And pick the best. */ 1.195 + best = rd_cost1 < rd_cost0; 1.196 + base_bits = *(vp9_dct_value_cost_ptr + x); 1.197 + dx = mul * (dqcoeff_ptr[rc] - coeff_ptr[rc]); 1.198 + d2 = dx * dx; 1.199 + tokens[i][0].rate = base_bits + (best ? rate1 : rate0); 1.200 + tokens[i][0].error = d2 + (best ? error1 : error0); 1.201 + tokens[i][0].next = next; 1.202 + tokens[i][0].token = t0; 1.203 + tokens[i][0].qc = x; 1.204 + best_index[i][0] = best; 1.205 + 1.206 + /* Evaluate the second possibility for this state. */ 1.207 + rate0 = tokens[next][0].rate; 1.208 + rate1 = tokens[next][1].rate; 1.209 + 1.210 + if ((abs(x)*dequant_ptr[rc != 0] > abs(coeff_ptr[rc]) * mul) && 1.211 + (abs(x)*dequant_ptr[rc != 0] < abs(coeff_ptr[rc]) * mul + 1.212 + dequant_ptr[rc != 0])) 1.213 + shortcut = 1; 1.214 + else 1.215 + shortcut = 0; 1.216 + 1.217 + if (shortcut) { 1.218 + sz = -(x < 0); 1.219 + x -= 2 * sz + 1; 1.220 + } 1.221 + 1.222 + /* Consider both possible successor states. */ 1.223 + if (!x) { 1.224 + /* If we reduced this coefficient to zero, check to see if 1.225 + * we need to move the EOB back here. 1.226 + */ 1.227 + t0 = tokens[next][0].token == DCT_EOB_TOKEN ? 1.228 + DCT_EOB_TOKEN : ZERO_TOKEN; 1.229 + t1 = tokens[next][1].token == DCT_EOB_TOKEN ? 1.230 + DCT_EOB_TOKEN : ZERO_TOKEN; 1.231 + } else { 1.232 + t0 = t1 = (vp9_dct_value_tokens_ptr + x)->token; 1.233 + } 1.234 + if (next < default_eob) { 1.235 + band = band_translate[i + 1]; 1.236 + if (t0 != DCT_EOB_TOKEN) { 1.237 + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); 1.238 + rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] 1.239 + [tokens[next][0].token]; 1.240 + } 1.241 + if (t1 != DCT_EOB_TOKEN) { 1.242 + pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); 1.243 + rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] 1.244 + [tokens[next][1].token]; 1.245 + } 1.246 + } 1.247 + 1.248 + UPDATE_RD_COST(); 1.249 + /* And pick the best. */ 1.250 + best = rd_cost1 < rd_cost0; 1.251 + base_bits = *(vp9_dct_value_cost_ptr + x); 1.252 + 1.253 + if (shortcut) { 1.254 + dx -= (dequant_ptr[rc != 0] + sz) ^ sz; 1.255 + d2 = dx * dx; 1.256 + } 1.257 + tokens[i][1].rate = base_bits + (best ? rate1 : rate0); 1.258 + tokens[i][1].error = d2 + (best ? error1 : error0); 1.259 + tokens[i][1].next = next; 1.260 + tokens[i][1].token = best ? t1 : t0; 1.261 + tokens[i][1].qc = x; 1.262 + best_index[i][1] = best; 1.263 + /* Finally, make this the new head of the trellis. */ 1.264 + next = i; 1.265 + } else { 1.266 + /* There's no choice to make for a zero coefficient, so we don't 1.267 + * add a new trellis node, but we do need to update the costs. 1.268 + */ 1.269 + band = band_translate[i + 1]; 1.270 + t0 = tokens[next][0].token; 1.271 + t1 = tokens[next][1].token; 1.272 + /* Update the cost of each path if we're past the EOB token. */ 1.273 + if (t0 != DCT_EOB_TOKEN) { 1.274 + tokens[next][0].rate += 1.275 + mb->token_costs[tx_size][type][ref][band][1][0][t0]; 1.276 + tokens[next][0].token = ZERO_TOKEN; 1.277 + } 1.278 + if (t1 != DCT_EOB_TOKEN) { 1.279 + tokens[next][1].rate += 1.280 + mb->token_costs[tx_size][type][ref][band][1][0][t1]; 1.281 + tokens[next][1].token = ZERO_TOKEN; 1.282 + } 1.283 + best_index[i][0] = best_index[i][1] = 0; 1.284 + /* Don't update next, because we didn't add a new node. */ 1.285 + } 1.286 + } 1.287 + 1.288 + /* Now pick the best path through the whole trellis. */ 1.289 + band = band_translate[i + 1]; 1.290 + pt = combine_entropy_contexts(*a, *l); 1.291 + rate0 = tokens[next][0].rate; 1.292 + rate1 = tokens[next][1].rate; 1.293 + error0 = tokens[next][0].error; 1.294 + error1 = tokens[next][1].error; 1.295 + t0 = tokens[next][0].token; 1.296 + t1 = tokens[next][1].token; 1.297 + rate0 += mb->token_costs[tx_size][type][ref][band][0][pt][t0]; 1.298 + rate1 += mb->token_costs[tx_size][type][ref][band][0][pt][t1]; 1.299 + UPDATE_RD_COST(); 1.300 + best = rd_cost1 < rd_cost0; 1.301 + final_eob = i0 - 1; 1.302 + vpx_memset(qcoeff_ptr, 0, sizeof(*qcoeff_ptr) * (16 << (tx_size * 2))); 1.303 + vpx_memset(dqcoeff_ptr, 0, sizeof(*dqcoeff_ptr) * (16 << (tx_size * 2))); 1.304 + for (i = next; i < eob; i = next) { 1.305 + x = tokens[i][best].qc; 1.306 + if (x) { 1.307 + final_eob = i; 1.308 + } 1.309 + rc = scan[i]; 1.310 + qcoeff_ptr[rc] = x; 1.311 + dqcoeff_ptr[rc] = (x * dequant_ptr[rc != 0]) / mul; 1.312 + 1.313 + next = tokens[i][best].next; 1.314 + best = best_index[i][best]; 1.315 + } 1.316 + final_eob++; 1.317 + 1.318 + xd->plane[plane].eobs[block] = final_eob; 1.319 + *a = *l = (final_eob > 0); 1.320 +} 1.321 + 1.322 +void vp9_optimize_b(int plane, int block, BLOCK_SIZE plane_bsize, 1.323 + TX_SIZE tx_size, MACROBLOCK *mb, struct optimize_ctx *ctx) { 1.324 + int x, y; 1.325 + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y); 1.326 + optimize_b(mb, plane, block, plane_bsize, 1.327 + &ctx->ta[plane][x], &ctx->tl[plane][y], tx_size); 1.328 +} 1.329 + 1.330 +static void optimize_init_b(int plane, BLOCK_SIZE bsize, 1.331 + struct encode_b_args *args) { 1.332 + const MACROBLOCKD *xd = &args->x->e_mbd; 1.333 + const struct macroblockd_plane* const pd = &xd->plane[plane]; 1.334 + const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd); 1.335 + const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; 1.336 + const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; 1.337 + const MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; 1.338 + const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi) : mbmi->tx_size; 1.339 + 1.340 + vp9_get_entropy_contexts(tx_size, args->ctx->ta[plane], args->ctx->tl[plane], 1.341 + pd->above_context, pd->left_context, 1.342 + num_4x4_w, num_4x4_h); 1.343 +} 1.344 + 1.345 +void vp9_xform_quant(int plane, int block, BLOCK_SIZE plane_bsize, 1.346 + TX_SIZE tx_size, void *arg) { 1.347 + struct encode_b_args* const args = arg; 1.348 + MACROBLOCK* const x = args->x; 1.349 + MACROBLOCKD* const xd = &x->e_mbd; 1.350 + struct macroblock_plane *const p = &x->plane[plane]; 1.351 + struct macroblockd_plane *const pd = &xd->plane[plane]; 1.352 + int16_t *coeff = BLOCK_OFFSET(p->coeff, block); 1.353 + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); 1.354 + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); 1.355 + const int16_t *scan, *iscan; 1.356 + uint16_t *eob = &pd->eobs[block]; 1.357 + const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; 1.358 + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; 1.359 + int xoff, yoff; 1.360 + int16_t *src_diff; 1.361 + 1.362 + switch (tx_size) { 1.363 + case TX_32X32: 1.364 + scan = vp9_default_scan_32x32; 1.365 + iscan = vp9_default_iscan_32x32; 1.366 + block >>= 6; 1.367 + xoff = 32 * (block & twmask); 1.368 + yoff = 32 * (block >> twl); 1.369 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.370 + if (x->use_lp32x32fdct) 1.371 + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); 1.372 + else 1.373 + vp9_fdct32x32(src_diff, coeff, bw * 4); 1.374 + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, 1.375 + p->quant, p->quant_shift, qcoeff, dqcoeff, 1.376 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.377 + break; 1.378 + case TX_16X16: 1.379 + scan = vp9_default_scan_16x16; 1.380 + iscan = vp9_default_iscan_16x16; 1.381 + block >>= 4; 1.382 + xoff = 16 * (block & twmask); 1.383 + yoff = 16 * (block >> twl); 1.384 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.385 + vp9_fdct16x16(src_diff, coeff, bw * 4); 1.386 + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, 1.387 + p->quant, p->quant_shift, qcoeff, dqcoeff, 1.388 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.389 + break; 1.390 + case TX_8X8: 1.391 + scan = vp9_default_scan_8x8; 1.392 + iscan = vp9_default_iscan_8x8; 1.393 + block >>= 2; 1.394 + xoff = 8 * (block & twmask); 1.395 + yoff = 8 * (block >> twl); 1.396 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.397 + vp9_fdct8x8(src_diff, coeff, bw * 4); 1.398 + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, 1.399 + p->quant, p->quant_shift, qcoeff, dqcoeff, 1.400 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.401 + break; 1.402 + case TX_4X4: 1.403 + scan = vp9_default_scan_4x4; 1.404 + iscan = vp9_default_iscan_4x4; 1.405 + xoff = 4 * (block & twmask); 1.406 + yoff = 4 * (block >> twl); 1.407 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.408 + x->fwd_txm4x4(src_diff, coeff, bw * 4); 1.409 + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, 1.410 + p->quant, p->quant_shift, qcoeff, dqcoeff, 1.411 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.412 + break; 1.413 + default: 1.414 + assert(0); 1.415 + } 1.416 +} 1.417 + 1.418 +static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize, 1.419 + TX_SIZE tx_size, void *arg) { 1.420 + struct encode_b_args *const args = arg; 1.421 + MACROBLOCK *const x = args->x; 1.422 + MACROBLOCKD *const xd = &x->e_mbd; 1.423 + struct optimize_ctx *const ctx = args->ctx; 1.424 + struct macroblockd_plane *const pd = &xd->plane[plane]; 1.425 + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); 1.426 + int i, j; 1.427 + uint8_t *dst; 1.428 + txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j); 1.429 + dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i]; 1.430 + 1.431 + // TODO(jingning): per transformed block zero forcing only enabled for 1.432 + // luma component. will integrate chroma components as well. 1.433 + if (x->zcoeff_blk[tx_size][block] && plane == 0) { 1.434 + pd->eobs[block] = 0; 1.435 + ctx->ta[plane][i] = 0; 1.436 + ctx->tl[plane][j] = 0; 1.437 + return; 1.438 + } 1.439 + 1.440 + if (!x->skip_recode) 1.441 + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); 1.442 + 1.443 + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { 1.444 + vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx); 1.445 + } else { 1.446 + ctx->ta[plane][i] = pd->eobs[block] > 0; 1.447 + ctx->tl[plane][j] = pd->eobs[block] > 0; 1.448 + } 1.449 + 1.450 + if (x->skip_encode || pd->eobs[block] == 0) 1.451 + return; 1.452 + 1.453 + switch (tx_size) { 1.454 + case TX_32X32: 1.455 + vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); 1.456 + break; 1.457 + case TX_16X16: 1.458 + vp9_idct16x16_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); 1.459 + break; 1.460 + case TX_8X8: 1.461 + vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); 1.462 + break; 1.463 + case TX_4X4: 1.464 + // this is like vp9_short_idct4x4 but has a special case around eob<=1 1.465 + // which is significant (not just an optimization) for the lossless 1.466 + // case. 1.467 + xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); 1.468 + break; 1.469 + default: 1.470 + assert(!"Invalid transform size"); 1.471 + } 1.472 +} 1.473 + 1.474 +static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize, 1.475 + TX_SIZE tx_size, void *arg) { 1.476 + struct encode_b_args *const args = arg; 1.477 + MACROBLOCK *const x = args->x; 1.478 + MACROBLOCKD *const xd = &x->e_mbd; 1.479 + struct macroblockd_plane *const pd = &xd->plane[plane]; 1.480 + const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size, 1.481 + block); 1.482 + 1.483 + int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); 1.484 + uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block, 1.485 + pd->dst.buf, pd->dst.stride); 1.486 + 1.487 + vp9_xform_quant(plane, block, plane_bsize, tx_size, arg); 1.488 + 1.489 + if (pd->eobs[block] == 0) 1.490 + return; 1.491 + 1.492 + xd->itxm_add(dqcoeff, dst, pd->dst.stride, pd->eobs[block]); 1.493 +} 1.494 + 1.495 +void vp9_encode_sby(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.496 + MACROBLOCKD *const xd = &x->e_mbd; 1.497 + struct optimize_ctx ctx; 1.498 + struct encode_b_args arg = {x, &ctx}; 1.499 + 1.500 + vp9_subtract_sby(x, bsize); 1.501 + if (x->optimize) 1.502 + optimize_init_b(0, bsize, &arg); 1.503 + 1.504 + foreach_transformed_block_in_plane(xd, bsize, 0, encode_block_pass1, &arg); 1.505 +} 1.506 + 1.507 +void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.508 + MACROBLOCKD *const xd = &x->e_mbd; 1.509 + struct optimize_ctx ctx; 1.510 + struct encode_b_args arg = {x, &ctx}; 1.511 + 1.512 + if (!x->skip_recode) 1.513 + vp9_subtract_sb(x, bsize); 1.514 + 1.515 + if (x->optimize && (!x->skip_recode || !x->skip_optimize)) { 1.516 + int i; 1.517 + for (i = 0; i < MAX_MB_PLANE; ++i) 1.518 + optimize_init_b(i, bsize, &arg); 1.519 + } 1.520 + 1.521 + foreach_transformed_block(xd, bsize, encode_block, &arg); 1.522 +} 1.523 + 1.524 +void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, 1.525 + TX_SIZE tx_size, void *arg) { 1.526 + struct encode_b_args* const args = arg; 1.527 + MACROBLOCK *const x = args->x; 1.528 + MACROBLOCKD *const xd = &x->e_mbd; 1.529 + MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi; 1.530 + struct macroblock_plane *const p = &x->plane[plane]; 1.531 + struct macroblockd_plane *const pd = &xd->plane[plane]; 1.532 + int16_t *coeff = BLOCK_OFFSET(p->coeff, block); 1.533 + int16_t *qcoeff = BLOCK_OFFSET(pd->qcoeff, block); 1.534 + int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); 1.535 + const int16_t *scan, *iscan; 1.536 + TX_TYPE tx_type; 1.537 + MB_PREDICTION_MODE mode; 1.538 + const int bwl = b_width_log2(plane_bsize), bw = 1 << bwl; 1.539 + const int twl = bwl - tx_size, twmask = (1 << twl) - 1; 1.540 + int xoff, yoff; 1.541 + uint8_t *src, *dst; 1.542 + int16_t *src_diff; 1.543 + uint16_t *eob = &pd->eobs[block]; 1.544 + 1.545 + if (xd->mb_to_right_edge < 0 || xd->mb_to_bottom_edge < 0) 1.546 + extend_for_intra(xd, plane_bsize, plane, block, tx_size); 1.547 + 1.548 + // if (x->optimize) 1.549 + // vp9_optimize_b(plane, block, plane_bsize, tx_size, x, args->ctx); 1.550 + 1.551 + switch (tx_size) { 1.552 + case TX_32X32: 1.553 + scan = vp9_default_scan_32x32; 1.554 + iscan = vp9_default_iscan_32x32; 1.555 + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; 1.556 + block >>= 6; 1.557 + xoff = 32 * (block & twmask); 1.558 + yoff = 32 * (block >> twl); 1.559 + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; 1.560 + vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode, 1.561 + dst, pd->dst.stride, dst, pd->dst.stride); 1.562 + 1.563 + if (!x->skip_recode) { 1.564 + src = p->src.buf + yoff * p->src.stride + xoff; 1.565 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.566 + vp9_subtract_block(32, 32, src_diff, bw * 4, 1.567 + src, p->src.stride, dst, pd->dst.stride); 1.568 + if (x->use_lp32x32fdct) 1.569 + vp9_fdct32x32_rd(src_diff, coeff, bw * 4); 1.570 + else 1.571 + vp9_fdct32x32(src_diff, coeff, bw * 4); 1.572 + vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, 1.573 + p->quant, p->quant_shift, qcoeff, dqcoeff, 1.574 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.575 + } 1.576 + if (!x->skip_encode && *eob) 1.577 + vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob); 1.578 + break; 1.579 + case TX_16X16: 1.580 + tx_type = get_tx_type_16x16(pd->plane_type, xd); 1.581 + scan = get_scan_16x16(tx_type); 1.582 + iscan = get_iscan_16x16(tx_type); 1.583 + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; 1.584 + block >>= 4; 1.585 + xoff = 16 * (block & twmask); 1.586 + yoff = 16 * (block >> twl); 1.587 + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; 1.588 + vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode, 1.589 + dst, pd->dst.stride, dst, pd->dst.stride); 1.590 + if (!x->skip_recode) { 1.591 + src = p->src.buf + yoff * p->src.stride + xoff; 1.592 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.593 + vp9_subtract_block(16, 16, src_diff, bw * 4, 1.594 + src, p->src.stride, dst, pd->dst.stride); 1.595 + vp9_fht16x16(tx_type, src_diff, coeff, bw * 4); 1.596 + vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, 1.597 + p->quant, p->quant_shift, qcoeff, dqcoeff, 1.598 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.599 + } 1.600 + if (!x->skip_encode && *eob) 1.601 + vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); 1.602 + break; 1.603 + case TX_8X8: 1.604 + tx_type = get_tx_type_8x8(pd->plane_type, xd); 1.605 + scan = get_scan_8x8(tx_type); 1.606 + iscan = get_iscan_8x8(tx_type); 1.607 + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; 1.608 + block >>= 2; 1.609 + xoff = 8 * (block & twmask); 1.610 + yoff = 8 * (block >> twl); 1.611 + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; 1.612 + vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode, 1.613 + dst, pd->dst.stride, dst, pd->dst.stride); 1.614 + if (!x->skip_recode) { 1.615 + src = p->src.buf + yoff * p->src.stride + xoff; 1.616 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.617 + vp9_subtract_block(8, 8, src_diff, bw * 4, 1.618 + src, p->src.stride, dst, pd->dst.stride); 1.619 + vp9_fht8x8(tx_type, src_diff, coeff, bw * 4); 1.620 + vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, 1.621 + p->quant_shift, qcoeff, dqcoeff, 1.622 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.623 + } 1.624 + if (!x->skip_encode && *eob) 1.625 + vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob); 1.626 + break; 1.627 + case TX_4X4: 1.628 + tx_type = get_tx_type_4x4(pd->plane_type, xd, block); 1.629 + scan = get_scan_4x4(tx_type); 1.630 + iscan = get_iscan_4x4(tx_type); 1.631 + if (mbmi->sb_type < BLOCK_8X8 && plane == 0) 1.632 + mode = xd->mi_8x8[0]->bmi[block].as_mode; 1.633 + else 1.634 + mode = plane == 0 ? mbmi->mode : mbmi->uv_mode; 1.635 + 1.636 + xoff = 4 * (block & twmask); 1.637 + yoff = 4 * (block >> twl); 1.638 + dst = pd->dst.buf + yoff * pd->dst.stride + xoff; 1.639 + vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode, 1.640 + dst, pd->dst.stride, dst, pd->dst.stride); 1.641 + 1.642 + if (!x->skip_recode) { 1.643 + src = p->src.buf + yoff * p->src.stride + xoff; 1.644 + src_diff = p->src_diff + 4 * bw * yoff + xoff; 1.645 + vp9_subtract_block(4, 4, src_diff, bw * 4, 1.646 + src, p->src.stride, dst, pd->dst.stride); 1.647 + if (tx_type != DCT_DCT) 1.648 + vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type); 1.649 + else 1.650 + x->fwd_txm4x4(src_diff, coeff, bw * 4); 1.651 + vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, 1.652 + p->quant_shift, qcoeff, dqcoeff, 1.653 + pd->dequant, p->zbin_extra, eob, scan, iscan); 1.654 + } 1.655 + 1.656 + if (!x->skip_encode && *eob) { 1.657 + if (tx_type == DCT_DCT) 1.658 + // this is like vp9_short_idct4x4 but has a special case around eob<=1 1.659 + // which is significant (not just an optimization) for the lossless 1.660 + // case. 1.661 + xd->itxm_add(dqcoeff, dst, pd->dst.stride, *eob); 1.662 + else 1.663 + vp9_iht4x4_16_add(dqcoeff, dst, pd->dst.stride, tx_type); 1.664 + } 1.665 + break; 1.666 + default: 1.667 + assert(0); 1.668 + } 1.669 +} 1.670 + 1.671 +void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.672 + MACROBLOCKD* const xd = &x->e_mbd; 1.673 + struct optimize_ctx ctx; 1.674 + struct encode_b_args arg = {x, &ctx}; 1.675 + 1.676 + foreach_transformed_block_in_plane(xd, bsize, 0, vp9_encode_block_intra, 1.677 + &arg); 1.678 +} 1.679 +void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize) { 1.680 + MACROBLOCKD* const xd = &x->e_mbd; 1.681 + struct optimize_ctx ctx; 1.682 + struct encode_b_args arg = {x, &ctx}; 1.683 + foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg); 1.684 +} 1.685 +