Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

michael@0 | 12 | #include "vpx_config.h" |
michael@0 | 13 | #include "vp8_rtcd.h" |
michael@0 | 14 | #include "vpx_ports/x86.h" |
michael@0 | 15 | #include "vpx_mem/vpx_mem.h" |
michael@0 | 16 | #include "vp8/encoder/block.h" |
michael@0 | 17 | #include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */ |
michael@0 | 18 | |
michael@0 | 19 | #include <mmintrin.h> /* MMX */ |
michael@0 | 20 | #include <xmmintrin.h> /* SSE */ |
michael@0 | 21 | #include <emmintrin.h> /* SSE2 */ |
michael@0 | 22 | |
/* One unrolled step of the zig-zag EOB scan.
 * i is the 1-based scan position, z the raster index it maps to.
 * Reads the scratch arrays x[] (value minus zbin) and y[] (quantized
 * coefficient) plus zbin_boost_ptr/qcoeff_ptr/eob/b from the enclosing
 * function. The boost pointer always advances; it is reset to the start
 * of b->zrun_zbin_boost only when a nonzero coefficient is kept. */
#define SELECT_EOB(i, z)                                    \
    do {                                                    \
        short boost = *zbin_boost_ptr;                      \
        int cmp = (x[z] < boost) | (y[z] == 0);             \
        zbin_boost_ptr++;                                   \
        if (!cmp) {                                         \
            qcoeff_ptr[z] = y[z];                           \
            eob = i;                                        \
            zbin_boost_ptr = b->zrun_zbin_boost;            \
        }                                                   \
    } while (0)
michael@0 | 35 | |
michael@0 | 36 | void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d) |
michael@0 | 37 | { |
michael@0 | 38 | char eob = 0; |
michael@0 | 39 | short *zbin_boost_ptr = b->zrun_zbin_boost; |
michael@0 | 40 | short *qcoeff_ptr = d->qcoeff; |
michael@0 | 41 | DECLARE_ALIGNED_ARRAY(16, short, x, 16); |
michael@0 | 42 | DECLARE_ALIGNED_ARRAY(16, short, y, 16); |
michael@0 | 43 | |
michael@0 | 44 | __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1; |
michael@0 | 45 | __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift)); |
michael@0 | 46 | __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8)); |
michael@0 | 47 | __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); |
michael@0 | 48 | __m128i z1 = _mm_load_si128((__m128i *)(b->coeff+8)); |
michael@0 | 49 | __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra); |
michael@0 | 50 | __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin)); |
michael@0 | 51 | __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8)); |
michael@0 | 52 | __m128i round0 = _mm_load_si128((__m128i *)(b->round)); |
michael@0 | 53 | __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); |
michael@0 | 54 | __m128i quant0 = _mm_load_si128((__m128i *)(b->quant)); |
michael@0 | 55 | __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8)); |
michael@0 | 56 | __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); |
michael@0 | 57 | __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); |
michael@0 | 58 | |
michael@0 | 59 | vpx_memset(qcoeff_ptr, 0, 32); |
michael@0 | 60 | |
michael@0 | 61 | /* Duplicate to all lanes. */ |
michael@0 | 62 | zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0); |
michael@0 | 63 | zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra); |
michael@0 | 64 | |
michael@0 | 65 | /* Sign of z: z >> 15 */ |
michael@0 | 66 | sz0 = _mm_srai_epi16(z0, 15); |
michael@0 | 67 | sz1 = _mm_srai_epi16(z1, 15); |
michael@0 | 68 | |
michael@0 | 69 | /* x = abs(z): (z ^ sz) - sz */ |
michael@0 | 70 | x0 = _mm_xor_si128(z0, sz0); |
michael@0 | 71 | x1 = _mm_xor_si128(z1, sz1); |
michael@0 | 72 | x0 = _mm_sub_epi16(x0, sz0); |
michael@0 | 73 | x1 = _mm_sub_epi16(x1, sz1); |
michael@0 | 74 | |
michael@0 | 75 | /* zbin[] + zbin_extra */ |
michael@0 | 76 | zbin0 = _mm_add_epi16(zbin0, zbin_extra); |
michael@0 | 77 | zbin1 = _mm_add_epi16(zbin1, zbin_extra); |
michael@0 | 78 | |
michael@0 | 79 | /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance |
michael@0 | 80 | * the equation because boost is the only value which can change: |
michael@0 | 81 | * x - (zbin[] + extra) >= boost */ |
michael@0 | 82 | x_minus_zbin0 = _mm_sub_epi16(x0, zbin0); |
michael@0 | 83 | x_minus_zbin1 = _mm_sub_epi16(x1, zbin1); |
michael@0 | 84 | |
michael@0 | 85 | _mm_store_si128((__m128i *)(x), x_minus_zbin0); |
michael@0 | 86 | _mm_store_si128((__m128i *)(x + 8), x_minus_zbin1); |
michael@0 | 87 | |
michael@0 | 88 | /* All the remaining calculations are valid whether they are done now with |
michael@0 | 89 | * simd or later inside the loop one at a time. */ |
michael@0 | 90 | x0 = _mm_add_epi16(x0, round0); |
michael@0 | 91 | x1 = _mm_add_epi16(x1, round1); |
michael@0 | 92 | |
michael@0 | 93 | y0 = _mm_mulhi_epi16(x0, quant0); |
michael@0 | 94 | y1 = _mm_mulhi_epi16(x1, quant1); |
michael@0 | 95 | |
michael@0 | 96 | y0 = _mm_add_epi16(y0, x0); |
michael@0 | 97 | y1 = _mm_add_epi16(y1, x1); |
michael@0 | 98 | |
michael@0 | 99 | /* Instead of shifting each value independently we convert the scaling |
michael@0 | 100 | * factor with 1 << (16 - shift) so we can use multiply/return high half. */ |
michael@0 | 101 | y0 = _mm_mulhi_epi16(y0, quant_shift0); |
michael@0 | 102 | y1 = _mm_mulhi_epi16(y1, quant_shift1); |
michael@0 | 103 | |
michael@0 | 104 | /* Return the sign: (y ^ sz) - sz */ |
michael@0 | 105 | y0 = _mm_xor_si128(y0, sz0); |
michael@0 | 106 | y1 = _mm_xor_si128(y1, sz1); |
michael@0 | 107 | y0 = _mm_sub_epi16(y0, sz0); |
michael@0 | 108 | y1 = _mm_sub_epi16(y1, sz1); |
michael@0 | 109 | |
michael@0 | 110 | _mm_store_si128((__m128i *)(y), y0); |
michael@0 | 111 | _mm_store_si128((__m128i *)(y + 8), y1); |
michael@0 | 112 | |
michael@0 | 113 | zbin_boost_ptr = b->zrun_zbin_boost; |
michael@0 | 114 | |
michael@0 | 115 | /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */ |
michael@0 | 116 | SELECT_EOB(1, 0); |
michael@0 | 117 | SELECT_EOB(2, 1); |
michael@0 | 118 | SELECT_EOB(3, 4); |
michael@0 | 119 | SELECT_EOB(4, 8); |
michael@0 | 120 | SELECT_EOB(5, 5); |
michael@0 | 121 | SELECT_EOB(6, 2); |
michael@0 | 122 | SELECT_EOB(7, 3); |
michael@0 | 123 | SELECT_EOB(8, 6); |
michael@0 | 124 | SELECT_EOB(9, 9); |
michael@0 | 125 | SELECT_EOB(10, 12); |
michael@0 | 126 | SELECT_EOB(11, 13); |
michael@0 | 127 | SELECT_EOB(12, 10); |
michael@0 | 128 | SELECT_EOB(13, 7); |
michael@0 | 129 | SELECT_EOB(14, 11); |
michael@0 | 130 | SELECT_EOB(15, 14); |
michael@0 | 131 | SELECT_EOB(16, 15); |
michael@0 | 132 | |
michael@0 | 133 | y0 = _mm_load_si128((__m128i *)(d->qcoeff)); |
michael@0 | 134 | y1 = _mm_load_si128((__m128i *)(d->qcoeff + 8)); |
michael@0 | 135 | |
michael@0 | 136 | /* dqcoeff = qcoeff * dequant */ |
michael@0 | 137 | y0 = _mm_mullo_epi16(y0, dequant0); |
michael@0 | 138 | y1 = _mm_mullo_epi16(y1, dequant1); |
michael@0 | 139 | |
michael@0 | 140 | _mm_store_si128((__m128i *)(d->dqcoeff), y0); |
michael@0 | 141 | _mm_store_si128((__m128i *)(d->dqcoeff + 8), y1); |
michael@0 | 142 | |
michael@0 | 143 | *d->eob = eob; |
michael@0 | 144 | } |
michael@0 | 145 | |
michael@0 | 146 | void vp8_fast_quantize_b_sse2(BLOCK *b, BLOCKD *d) |
michael@0 | 147 | { |
michael@0 | 148 | __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); |
michael@0 | 149 | __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); |
michael@0 | 150 | __m128i round0 = _mm_load_si128((__m128i *)(b->round)); |
michael@0 | 151 | __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); |
michael@0 | 152 | __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); |
michael@0 | 153 | __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); |
michael@0 | 154 | __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); |
michael@0 | 155 | __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); |
michael@0 | 156 | __m128i inv_zig_zag0 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag)); |
michael@0 | 157 | __m128i inv_zig_zag1 = _mm_load_si128((const __m128i *)(vp8_default_inv_zig_zag + 8)); |
michael@0 | 158 | |
michael@0 | 159 | __m128i sz0, sz1, x0, x1, y0, y1, xdq0, xdq1, zeros, ones; |
michael@0 | 160 | |
michael@0 | 161 | /* sign of z: z >> 15 */ |
michael@0 | 162 | sz0 = _mm_srai_epi16(z0, 15); |
michael@0 | 163 | sz1 = _mm_srai_epi16(z1, 15); |
michael@0 | 164 | |
michael@0 | 165 | /* x = abs(z): (z ^ sz) - sz */ |
michael@0 | 166 | x0 = _mm_xor_si128(z0, sz0); |
michael@0 | 167 | x1 = _mm_xor_si128(z1, sz1); |
michael@0 | 168 | x0 = _mm_sub_epi16(x0, sz0); |
michael@0 | 169 | x1 = _mm_sub_epi16(x1, sz1); |
michael@0 | 170 | |
michael@0 | 171 | /* x += round */ |
michael@0 | 172 | x0 = _mm_add_epi16(x0, round0); |
michael@0 | 173 | x1 = _mm_add_epi16(x1, round1); |
michael@0 | 174 | |
michael@0 | 175 | /* y = (x * quant) >> 16 */ |
michael@0 | 176 | y0 = _mm_mulhi_epi16(x0, quant_fast0); |
michael@0 | 177 | y1 = _mm_mulhi_epi16(x1, quant_fast1); |
michael@0 | 178 | |
michael@0 | 179 | /* x = abs(y) = (y ^ sz) - sz */ |
michael@0 | 180 | y0 = _mm_xor_si128(y0, sz0); |
michael@0 | 181 | y1 = _mm_xor_si128(y1, sz1); |
michael@0 | 182 | x0 = _mm_sub_epi16(y0, sz0); |
michael@0 | 183 | x1 = _mm_sub_epi16(y1, sz1); |
michael@0 | 184 | |
michael@0 | 185 | /* qcoeff = x */ |
michael@0 | 186 | _mm_store_si128((__m128i *)(d->qcoeff), x0); |
michael@0 | 187 | _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); |
michael@0 | 188 | |
michael@0 | 189 | /* x * dequant */ |
michael@0 | 190 | xdq0 = _mm_mullo_epi16(x0, dequant0); |
michael@0 | 191 | xdq1 = _mm_mullo_epi16(x1, dequant1); |
michael@0 | 192 | |
michael@0 | 193 | /* dqcoeff = x * dequant */ |
michael@0 | 194 | _mm_store_si128((__m128i *)(d->dqcoeff), xdq0); |
michael@0 | 195 | _mm_store_si128((__m128i *)(d->dqcoeff + 8), xdq1); |
michael@0 | 196 | |
michael@0 | 197 | /* build a mask for the zig zag */ |
michael@0 | 198 | zeros = _mm_setzero_si128(); |
michael@0 | 199 | |
michael@0 | 200 | x0 = _mm_cmpeq_epi16(x0, zeros); |
michael@0 | 201 | x1 = _mm_cmpeq_epi16(x1, zeros); |
michael@0 | 202 | |
michael@0 | 203 | ones = _mm_cmpeq_epi16(zeros, zeros); |
michael@0 | 204 | |
michael@0 | 205 | x0 = _mm_xor_si128(x0, ones); |
michael@0 | 206 | x1 = _mm_xor_si128(x1, ones); |
michael@0 | 207 | |
michael@0 | 208 | x0 = _mm_and_si128(x0, inv_zig_zag0); |
michael@0 | 209 | x1 = _mm_and_si128(x1, inv_zig_zag1); |
michael@0 | 210 | |
michael@0 | 211 | x0 = _mm_max_epi16(x0, x1); |
michael@0 | 212 | |
michael@0 | 213 | /* now down to 8 */ |
michael@0 | 214 | x1 = _mm_shuffle_epi32(x0, 0xE); // 0b00001110 |
michael@0 | 215 | |
michael@0 | 216 | x0 = _mm_max_epi16(x0, x1); |
michael@0 | 217 | |
michael@0 | 218 | /* only 4 left */ |
michael@0 | 219 | x1 = _mm_shufflelo_epi16(x0, 0xE); // 0b00001110 |
michael@0 | 220 | |
michael@0 | 221 | x0 = _mm_max_epi16(x0, x1); |
michael@0 | 222 | |
michael@0 | 223 | /* okay, just 2! */ |
michael@0 | 224 | x1 = _mm_shufflelo_epi16(x0, 0x1); // 0b00000001 |
michael@0 | 225 | |
michael@0 | 226 | x0 = _mm_max_epi16(x0, x1); |
michael@0 | 227 | |
michael@0 | 228 | *d->eob = 0xFF & _mm_cvtsi128_si32(x0); |
michael@0 | 229 | } |