media/libvpx/vp8/encoder/x86/quantize_sse2.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license
michael@0 5 * that can be found in the LICENSE file in the root of the source
michael@0 6 * tree. An additional intellectual property rights grant can be found
michael@0 7 * in the file PATENTS. All contributing project authors may
michael@0 8 * be found in the AUTHORS file in the root of the source tree.
michael@0 9 */
michael@0 10
michael@0 11
michael@0 12 #include "vpx_config.h"
michael@0 13 #include "vp8_rtcd.h"
michael@0 14 #include "vpx_ports/x86.h"
michael@0 15 #include "vpx_mem/vpx_mem.h"
michael@0 16 #include "vp8/encoder/block.h"
michael@0 17 #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
michael@0 18
michael@0 19 #include <mmintrin.h> /* MMX */
michael@0 20 #include <xmmintrin.h> /* SSE */
michael@0 21 #include <emmintrin.h> /* SSE2 */
michael@0 22
/* Examine one coefficient in zig-zag scan order.  i is the 1-based scan
 * position, z the raster index of that position.  The coefficient is kept
 * (written to qcoeff_ptr, eob updated, zero-run boost restarted) only when
 * its quantized value y[z] is nonzero AND x[z] = abs(coeff) - (zbin+extra)
 * reaches the current zero-run boost; otherwise only the boost pointer
 * advances to the next (larger) boost value. */
#define SELECT_EOB(i, z)                              \
    do {                                              \
        short boost = *zbin_boost_ptr;                \
        zbin_boost_ptr++;                             \
        if (!((x[z] < boost) | (y[z] == 0))) {        \
            qcoeff_ptr[z] = y[z];                     \
            eob = i;                                  \
            zbin_boost_ptr = b->zrun_zbin_boost;      \
        }                                             \
    } while (0)
michael@0 35
/* Regular (zero-bin) quantizer for one 4x4 coefficient block, SSE2.
 *
 * The zbin/round/quant/quant_shift/zrun_zbin_boost tables in *b drive the
 * keep-or-kill decision and scaling; results land in d->qcoeff (quantized
 * coefficients), d->dqcoeff (dequantized reconstruction) and *d->eob
 * (1 + zig-zag position of the last surviving coefficient, 0 when the
 * whole block quantizes to zero).
 *
 * NOTE(review): every _mm_load/_mm_store below is the aligned form, so
 * b->coeff, the table pointers and d->qcoeff/dqcoeff must all be 16-byte
 * aligned — confirm against the BLOCK/BLOCKD declarations. */
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
    char eob = 0;
    short *zbin_boost_ptr = b->zrun_zbin_boost;
    short *qcoeff_ptr = d->qcoeff;
    /* Scratch tables read by SELECT_EOB, indexed by raster position:
     * x[i] = abs(z) - (zbin + extra), y[i] = candidate quantized value. */
    DECLARE_ALIGNED_ARRAY(16, short, x, 16);
    DECLARE_ALIGNED_ARRAY(16, short, y, 16);

    __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1;
    __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
    __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
    __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
    __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8));
    __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
    __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
    __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
    __m128i round0 = _mm_load_si128((__m128i *)(b->round));
    __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
    __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
    __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
    __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
    __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));

    /* Start from an all-zero output; SELECT_EOB writes only survivors.
     * 32 bytes = 16 shorts. */
    vpx_memset(qcoeff_ptr, 0, 32);

    /* Duplicate to all lanes. */
    zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
    zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);

    /* Sign of z: z >> 15 (0 for non-negative, -1 for negative lanes). */
    sz0 = _mm_srai_epi16(z0, 15);
    sz1 = _mm_srai_epi16(z1, 15);

    /* x = abs(z): (z ^ sz) - sz */
    x0 = _mm_xor_si128(z0, sz0);
    x1 = _mm_xor_si128(z1, sz1);
    x0 = _mm_sub_epi16(x0, sz0);
    x1 = _mm_sub_epi16(x1, sz1);

    /* zbin[] + zbin_extra */
    zbin0 = _mm_add_epi16(zbin0, zbin_extra);
    zbin1 = _mm_add_epi16(zbin1, zbin_extra);

    /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
     * the equation because boost is the only value which can change:
     * x - (zbin[] + extra) >= boost */
    x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
    x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);

    _mm_store_si128((__m128i *)(x), x_minus_zbin0);
    _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1);

    /* All the remaining calculations are valid whether they are done now with
     * simd or later inside the loop one at a time. */
    x0 = _mm_add_epi16(x0, round0);
    x1 = _mm_add_epi16(x1, round1);

    y0 = _mm_mulhi_epi16(x0, quant0);
    y1 = _mm_mulhi_epi16(x1, quant1);

    y0 = _mm_add_epi16(y0, x0);
    y1 = _mm_add_epi16(y1, x1);

    /* Instead of shifting each value independently we convert the scaling
     * factor with 1 << (16 - shift) so we can use multiply/return high half. */
    y0 = _mm_mulhi_epi16(y0, quant_shift0);
    y1 = _mm_mulhi_epi16(y1, quant_shift1);

    /* Return the sign: (y ^ sz) - sz */
    y0 = _mm_xor_si128(y0, sz0);
    y1 = _mm_xor_si128(y1, sz1);
    y0 = _mm_sub_epi16(y0, sz0);
    y1 = _mm_sub_epi16(y1, sz1);

    _mm_store_si128((__m128i *)(y), y0);
    _mm_store_si128((__m128i *)(y + 8), y1);

    /* Restart the zero-run boost scan for the eob pass. */
    zbin_boost_ptr = b->zrun_zbin_boost;

    /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup.
     * First argument is the 1-based scan position, second the raster index. */
    SELECT_EOB(1, 0);
    SELECT_EOB(2, 1);
    SELECT_EOB(3, 4);
    SELECT_EOB(4, 8);
    SELECT_EOB(5, 5);
    SELECT_EOB(6, 2);
    SELECT_EOB(7, 3);
    SELECT_EOB(8, 6);
    SELECT_EOB(9, 9);
    SELECT_EOB(10, 12);
    SELECT_EOB(11, 13);
    SELECT_EOB(12, 10);
    SELECT_EOB(13, 7);
    SELECT_EOB(14, 11);
    SELECT_EOB(15, 14);
    SELECT_EOB(16, 15);

    y0 = _mm_load_si128((__m128i *)(d->qcoeff));
    y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8));

    /* dqcoeff = qcoeff * dequant */
    y0 = _mm_mullo_epi16(y0, dequant0);
    y1 = _mm_mullo_epi16(y1, dequant1);

    _mm_store_si128((__m128i *)(d->dqcoeff), y0);
    _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1);

    *d->eob = eob;
}
michael@0 145
michael@0 146 void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d)
michael@0 147 {
michael@0 148 __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
michael@0 149 __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
michael@0 150 __m128i round0 = _mm_load_si128((__m128i *)(b->round));
michael@0 151 __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
michael@0 152 __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast));
michael@0 153 __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8));
michael@0 154 __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
michael@0 155 __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
michael@0 156 __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag));
michael@0 157 __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8));
michael@0 158
michael@0 159 __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones;
michael@0 160
michael@0 161 /* sign of z: z >> 15 */
michael@0 162 sz0 = _mm_srai_epi16(z0, 15);
michael@0 163 sz1 = _mm_srai_epi16(z1, 15);
michael@0 164
michael@0 165 /* x = abs(z): (z ^ sz) - sz */
michael@0 166 x0 = _mm_xor_si128(z0, sz0);
michael@0 167 x1 = _mm_xor_si128(z1, sz1);
michael@0 168 x0 = _mm_sub_epi16(x0, sz0);
michael@0 169 x1 = _mm_sub_epi16(x1, sz1);
michael@0 170
michael@0 171 /* x += round */
michael@0 172 x0 = _mm_add_epi16(x0, round0);
michael@0 173 x1 = _mm_add_epi16(x1, round1);
michael@0 174
michael@0 175 /* y = (x * quant) >> 16 */
michael@0 176 y0 = _mm_mulhi_epi16(x0, quant_fast0);
michael@0 177 y1 = _mm_mulhi_epi16(x1, quant_fast1);
michael@0 178
michael@0 179 /* x = abs(y) = (y ^ sz) - sz */
michael@0 180 y0 = _mm_xor_si128(y0, sz0);
michael@0 181 y1 = _mm_xor_si128(y1, sz1);
michael@0 182 x0 = _mm_sub_epi16(y0, sz0);
michael@0 183 x1 = _mm_sub_epi16(y1, sz1);
michael@0 184
michael@0 185 /* qcoeff = x */
michael@0 186 _mm_store_si128((__m128i *)(d->qcoeff), x0);
michael@0 187 _mm_store_si128((__m128i *)(d->qcoeff + 8), x1);
michael@0 188
michael@0 189 /* x * dequant */
michael@0 190 xdq0 = _mm_mullo_epi16(x0, dequant0);
michael@0 191 xdq1 = _mm_mullo_epi16(x1, dequant1);
michael@0 192
michael@0 193 /* dqcoeff = x * dequant */
michael@0 194 _mm_store_si128((__m128i *)(d->dqcoeff), xdq0);
michael@0 195 _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1);
michael@0 196
michael@0 197 /* build a mask for the zig zag */
michael@0 198 zeros = _mm_setzero_si128();
michael@0 199
michael@0 200 x0 = _mm_cmpeq_epi16(x0, zeros);
michael@0 201 x1 = _mm_cmpeq_epi16(x1, zeros);
michael@0 202
michael@0 203 ones = _mm_cmpeq_epi16(zeros, zeros);
michael@0 204
michael@0 205 x0 = _mm_xor_si128(x0, ones);
michael@0 206 x1 = _mm_xor_si128(x1, ones);
michael@0 207
michael@0 208 x0 = _mm_and_si128(x0, inv_zig_zag0);
michael@0 209 x1 = _mm_and_si128(x1, inv_zig_zag1);
michael@0 210
michael@0 211 x0 = _mm_max_epi16(x0, x1);
michael@0 212
michael@0 213 /* now down to 8 */
michael@0 214 x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110
michael@0 215
michael@0 216 x0 = _mm_max_epi16(x0, x1);
michael@0 217
michael@0 218 /* only 4 left */
michael@0 219 x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110
michael@0 220
michael@0 221 x0 = _mm_max_epi16(x0, x1);
michael@0 222
michael@0 223 /* okay, just 2! */
michael@0 224 x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001
michael@0 225
michael@0 226 x0 = _mm_max_epi16(x0, x1);
michael@0 227
michael@0 228 *d->eob = 0xFF & _mm_cvtsi128_si32(x0);
michael@0 229 }

mercurial