--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp8/encoder/x86/quantize_sse2.c	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,229 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */


#include "vpx_config.h"
#include "vp8_rtcd.h"
#include "vpx_ports/x86.h"
#include "vpx_mem/vpx_mem.h"
#include "vp8/encoder/block.h"
#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */

#include <mmintrin.h>  /* MMX */
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */

#define SELECT_EOB(i, z) \
    do { \
        short boost = *zbin_boost_ptr; \
        int cmp = (x[z] < boost) | (y[z] == 0); \
        zbin_boost_ptr++; \
        if (cmp) \
            goto select_eob_end_##i; \
        qcoeff_ptr[z] = y[z]; \
        eob = i; \
        zbin_boost_ptr = b->zrun_zbin_boost; \
        select_eob_end_##i:; \
    } while (0)
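
/* For reference, a scalar sketch of what one SELECT_EOB(i, z) step does
 * (illustrative rewrite only, not the code the macro expands to):
 *
 *     if (x[z] >= *zbin_boost_ptr++ && y[z] != 0) {
 *         qcoeff_ptr[z] = y[z];                 // keep the quantized value
 *         eob = i;                              // i is the 1-based zig-zag position
 *         zbin_boost_ptr = b->zrun_zbin_boost;  // reset the boost after a nonzero
 *     }
 *
 * x[] holds abs(coeff) - (zbin[] + zbin_extra) and y[] holds the signed
 * quantized values; both are filled in by vp8_regular_quantize_b_sse2()
 * below before the macro is used.
 */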

void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    char eob = 0;
    short *zbin_boost_ptr = b->zrun_zbin_boost;
    short *qcoeff_ptr = d->qcoeff;
    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
    DECLARE_ALIGNED_ARRAY(16, short, y, 16);

    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

    vpx_memset(qcoeff_ptr, 0, 32); /* 16 short coefficients */

    /* Duplicate zbin_extra to all 8 lanes. */
    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

    /* Sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* zbin[] + zbin_extra */
    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

    /* In C, x is compared against zbin where zbin = zbin[] + boost + extra.
     * Rebalance the inequality because boost is the only value which can
     * change: x - (zbin[] + extra) >= boost */
    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

    /* All the remaining calculations are valid whether they are done now
     * with SIMD or later inside the loop one at a time. */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    y0 = _mm_mulhi_epi16(x0, quant0);
    y1 = _mm_mulhi_epi16(x1, quant1);

    y0 = _mm_add_epi16(y0, x0);
    y1 = _mm_add_epi16(y1, x1);

    /* Instead of shifting each value independently, the scaling factor is
     * stored as 1 << (16 - shift) so a multiply/return-high-half performs
     * the shift. */
    y0 = _mm_mulhi_epi16(y0, quant_shift0);
    y1 = _mm_mulhi_epi16(y1, quant_shift1);

    /* Reapply the sign of z: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    y0 = _mm_sub_epi16(y0, sz0);
    y1 = _mm_sub_epi16(y1, sz1);

    _mm_store_si128((__m128i *)(y), y0);
    _mm_store_si128((__m128i *)(y + 8), y1);

    zbin_boost_ptr = b->zrun_zbin_boost;

    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
    SELECT_EOB(1, 0);
    SELECT_EOB(2, 1);
    SELECT_EOB(3, 4);
    SELECT_EOB(4, 8);
    SELECT_EOB(5, 5);
    SELECT_EOB(6, 2);
    SELECT_EOB(7, 3);
    SELECT_EOB(8, 6);
    SELECT_EOB(9, 9);
    SELECT_EOB(10, 12);
    SELECT_EOB(11, 13);
    SELECT_EOB(12, 10);
    SELECT_EOB(13, 7);
    SELECT_EOB(14, 11);
    SELECT_EOB(15, 14);
    SELECT_EOB(16, 15);

    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

    /* dqcoeff = qcoeff * dequant */
    y0 = _mm_mullo_epi16(y0, dequant0);
    y1 = _mm_mullo_epi16(y1, dequant1);

    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

    *d->eob = eob;
}
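
/* For reference, the fast path below is roughly the following scalar logic
 * (illustrative sketch only; quant_fast[] is effectively a Q16 reciprocal
 * of the quantizer step):
 *
 *     for (i = 0; i < 16; i++) {
 *         int z = b->coeff[i];
 *         int a = abs(z) + b->round[i];
 *         int y = (a * b->quant_fast[i]) >> 16;
 *         int x = z < 0 ? -y : y;
 *         d->qcoeff[i]  = (short)x;
 *         d->dqcoeff[i] = (short)(x * d->dequant[i]);
 *     }
 *     *d->eob = max of vp8_default_inv_zig_zag[i] over all i with
 *               d->qcoeff[i] != 0, or 0 if everything quantized to zero;
 *
 * The SIMD version handles all 16 coefficients in two 8-lane registers and
 * computes the eob with a compare/mask and a horizontal max.
 */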

void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
    __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
    __m128i inv_zig_zag0 =
        _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
    __m128i inv_zig_zag1 =
        _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));

    __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;

    /* Sign of z: z >> 15 */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* x += round */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    /* y = (x * quant) >> 16 */
    y0 = _mm_mulhi_epi16(x0, quant_fast0);
    y1 = _mm_mulhi_epi16(x1, quant_fast1);

    /* Reapply the sign of z: x = (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    x0 = _mm_sub_epi16(y0, sz0);
    x1 = _mm_sub_epi16(y1, sz1);

    /* qcoeff = x */
    _mm_store_si128((__m128i *)(d->qcoeff), x0);
    _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);

    /* x * dequant */
    xdq0 = _mm_mullo_epi16(x0, dequant0);
    xdq1 = _mm_mullo_epi16(x1, dequant1);

    /* dqcoeff = x * dequant */
    _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);

    /* Build a mask of the nonzero lanes and select their inverse zig-zag
     * indices. */
    zeros = _mm_setzero_si128();

    x0 = _mm_cmpeq_epi16(x0, zeros);
    x1 = _mm_cmpeq_epi16(x1, zeros);

    ones = _mm_cmpeq_epi16(zeros, zeros);

    x0 = _mm_xor_si128(x0, ones);
    x1 = _mm_xor_si128(x1, ones);

    x0 = _mm_and_si128(x0, inv_zig_zag0);
    x1 = _mm_and_si128(x1, inv_zig_zag1);

    /* Horizontal max: 16 lanes down to 8 */
    x0 = _mm_max_epi16(x0, x1);

    /* now down to 4 */
    x1 = _mm_shuffle_epi32(x0, 0xE); /* 0b00001110 */

    x0 = _mm_max_epi16(x0, x1);

    /* only 2 left */
    x1 = _mm_shufflelo_epi16(x0, 0xE); /* 0b00001110 */

    x0 = _mm_max_epi16(x0, x1);

    /* okay, just 1! */
    x1 = _mm_shufflelo_epi16(x0, 0x1); /* 0b00000001 */

    x0 = _mm_max_epi16(x0, x1);

    *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
}
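
/* Worked example of the eob reduction above (for illustration): if the only
 * nonzero quantized coefficients sit at raster positions 0, 1 and 4, the
 * compare/invert/AND step leaves lane values {1, 2, 0, 0, 3, 0, 0, 0} in x0
 * and all zeros in x1 (vp8_default_inv_zig_zag is 1-based), and the max
 * reduction returns 3: the last nonzero coefficient is the 3rd one in
 * zig-zag order. The shuffle/max ladder is equivalent to this scalar loop
 * (hypothetical helper, not part of the build):
 */
#if 0
/* Illustration only: scalar equivalent of the SIMD horizontal max. */
static char eob_from_masked_lanes(const short v[16])
{
    short m = 0;
    int i;
    for (i = 0; i < 16; i++)
        if (v[i] > m)
            m = v[i];
    return (char)m;
}
#endif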