media/libvpx/vp8/encoder/x86/quantize_sse2.c

changeset 6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp8/encoder/x86/quantize_sse2.c	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_config.h"
+#include "vp8_rtcd.h"
+#include "vpx_ports/x86.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#include <mmintrin.h> /* MMX */
+#include <xmmintrin.h> /* SSE */
+#include <emmintrin.h> /* SSE2 */
+
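+/* SELECT_EOB(i, z) tests the coefficient at zig-zag position z: it is
+ * dropped when it fails the rebalanced zbin check (x[z] < boost) or
+ * quantizes to zero; both tests fold into the single branch on cmp. A
+ * kept coefficient is stored, eob advances to the 1-based index i, and
+ * the boost pointer rewinds to the start of the zero-run table. */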
+#define SELECT_EOB(i, z) \
+    do { \
+        short boost = *zbin_boost_ptr; \
+        int cmp = (x[z] < boost) | (y[z] == 0); \
+        zbin_boost_ptr++; \
+        if (cmp) \
+            goto select_eob_end_##i; \
+        qcoeff_ptr[z] = y[z]; \
+        eob = i; \
+        zbin_boost_ptr = b->zrun_zbin_boost; \
+        select_eob_end_##i:; \
+    } while (0)
+
+void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+    char eob = 0;
+    short *zbin_boost_ptr  = b->zrun_zbin_boost;
+    short *qcoeff_ptr      = d->qcoeff;
+    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
+    DECLARE_ALIGNED_ARRAY(16, short, y, 16);
+
+    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
+    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+
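+    /* Clear all 16 qcoeff values (16 shorts == 32 bytes); the scan below
+     * only stores the coefficients it keeps. */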
+    vpx_memset(qcoeff_ptr, 0, 32);
+
+    /* Duplicate to all lanes. */
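+    /* shufflelo copies word 0 into the low four words; the unpack then
+     * pairs each word with itself, filling all eight lanes. */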
+    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+    /* Sign of z: z >> 15 */
+    sz0 = _mm_srai_epi16(z0, 15);
+    sz1 = _mm_srai_epi16(z1, 15);
+
+    /* x = abs(z): (z ^ sz) - sz */
+    x0 = _mm_xor_si128(z0, sz0);
+    x1 = _mm_xor_si128(z1, sz1);
+    x0 = _mm_sub_epi16(x0, sz0);
+    x1 = _mm_sub_epi16(x1, sz1);
+
+    /* zbin[] + zbin_extra */
+    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+    zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+    /* In C, x is compared to zbin where zbin = zbin[] + boost + extra.
+     * Rebalance the equation because boost is the only value which can
+     * change: x - (zbin[] + extra) >= boost */
+    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
+    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);
+
+    /* All the remaining calculations are valid whether they are done now
+     * with SIMD or later inside the loop, one value at a time. */
+    x0 = _mm_add_epi16(x0, round0);
+    x1 = _mm_add_epi16(x1, round1);
+
+    y0 = _mm_mulhi_epi16(x0, quant0);
+    y1 = _mm_mulhi_epi16(x1, quant1);
+
+    y0 = _mm_add_epi16(y0, x0);
+    y1 = _mm_add_epi16(y1, x1);
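+    /* y now holds ((x * quant) >> 16) + x, i.e. everything but the final
+     * shift; the round was already folded into x above. */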
+
+    /* Instead of shifting each value independently, we convert the scaling
+     * factor with 1 << (16 - shift) so we can use a multiply and return the
+     * high half. */
+    y0 = _mm_mulhi_epi16(y0, quant_shift0);
+    y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+    /* Return the sign: (y ^ sz) - sz */
+    y0 = _mm_xor_si128(y0, sz0);
+    y1 = _mm_xor_si128(y1, sz1);
+    y0 = _mm_sub_epi16(y0, sz0);
+    y1 = _mm_sub_epi16(y1, sz1);
+
+    _mm_store_si128((__m128i *)(y), y0);
+    _mm_store_si128((__m128i *)(y + 8), y1);
+
+    zbin_boost_ptr = b->zrun_zbin_boost;
+
+    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d
+     * lookup by hard-coding the scan order. */
+    SELECT_EOB(1, 0);
+    SELECT_EOB(2, 1);
+    SELECT_EOB(3, 4);
+    SELECT_EOB(4, 8);
+    SELECT_EOB(5, 5);
+    SELECT_EOB(6, 2);
+    SELECT_EOB(7, 3);
+    SELECT_EOB(8, 6);
+    SELECT_EOB(9, 9);
+    SELECT_EOB(10, 12);
+    SELECT_EOB(11, 13);
+    SELECT_EOB(12, 10);
+    SELECT_EOB(13, 7);
+    SELECT_EOB(14, 11);
+    SELECT_EOB(15, 14);
+    SELECT_EOB(16, 15);
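+    /* eob is now one past the zig-zag position of the last kept
+     * coefficient, or 0 if every coefficient was dropped. */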
+
+    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
+    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));
+
+    /* dqcoeff = qcoeff * dequant */
+    y0 = _mm_mullo_epi16(y0, dequant0);
+    y1 = _mm_mullo_epi16(y1, dequant1);
+
+    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
+    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);
+
+    *d->eob = eob;
+}
+
+void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
+{
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
+  __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
+  __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
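+  /* vp8_default_inv_zig_zag maps each raster position to its 1-based
+   * zig-zag order; it drives the branchless eob computation below. */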
+
+  __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
+
+  /* sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* x += round */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  /* y = (x * quant) >> 16 */
+  y0 = _mm_mulhi_epi16(x0, quant_fast0);
+  y1 = _mm_mulhi_epi16(x1, quant_fast1);
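+  /* The fast path skips the zbin test and the second shift stage of the
+   * regular quantizer above; the single multiply/high-half with quant_fast
+   * is the whole scale. */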
+
+  /* Put the sign of z back on y: x = (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  x0 = _mm_sub_epi16(y0, sz0);
+  x1 = _mm_sub_epi16(y1, sz1);
+
+  /* qcoeff = x */
+  _mm_store_si128((__m128i *)(d->qcoeff), x0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
+
+  /* x * dequant */
+  xdq0 = _mm_mullo_epi16(x0, dequant0);
+  xdq1 = _mm_mullo_epi16(x1, dequant1);
+
+  /* dqcoeff = x * dequant */
+  _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
+
+  /* build a mask for the zig zag */
+  zeros = _mm_setzero_si128();
+
+  x0 = _mm_cmpeq_epi16(x0, zeros);
+  x1 = _mm_cmpeq_epi16(x1, zeros);
+
+  ones = _mm_cmpeq_epi16(zeros, zeros);
+
+  x0 = _mm_xor_si128(x0, ones);
+  x1 = _mm_xor_si128(x1, ones);
+
+  x0 = _mm_and_si128(x0, inv_zig_zag0);
+  x1 = _mm_and_si128(x1, inv_zig_zag1);
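+  /* Each lane now holds its 1-based zig-zag order if its coefficient is
+   * nonzero, else 0, so the horizontal max over all lanes is the eob. */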
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* now down to 8 */
+  x1 = _mm_shuffle_epi32(x0, 0xE); /* 0b00001110 */
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* only 4 left */
+  x1 = _mm_shufflelo_epi16(x0, 0xE); /* 0b00001110 */
+
+  x0 = _mm_max_epi16(x0, x1);
+
+  /* okay, just 2! */
+  x1 = _mm_shufflelo_epi16(x0, 0x1); /* 0b00000001 */
+
+  x0 = _mm_max_epi16(x0, x1);
+
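+  /* The low word holds the maximum; eob is at most 16, so the low byte
+   * suffices. */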
+  *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
+}
