Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.
/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

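// A note on the fixed-point convention used throughout (descriptive comment
// added for exposition; the authoritative definitions live in vp9_idct.h):
// the cospi_*_64 constants are round(16384 * cos(k*pi/64)), DCT_CONST_BITS
// is 14, and DCT_CONST_ROUNDING is 1 << (DCT_CONST_BITS - 1), so the
// recurring madd/add/srai pattern below computes
// dct_const_round_shift(a * c0 + b * c1) in each 32-bit lane.
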
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}
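
// Scalar sketch of what one RECON_AND_STORE4X4 step computes (added for
// exposition; not part of the upstream file). _mm_packus_epi16 saturates to
// [0, 255], which clip_pixel() from vp9_common.h reproduces; the macro also
// advances dest by stride afterwards.
static INLINE void recon_and_store4x4_row_c(uint8_t *dest,
                                            const int16_t *in_x) {
  int i;
  for (i = 0; i < 4; ++i)
    dest[i] = clip_pixel(dest[i] + in_x[i]);
}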

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3 of input1; after the add/sub below, input2 holds
  // columns 1 and 0, and input3 holds columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3 of input1; after the add/sub below, input2 holds
  // columns 1 and 0, and input3 holds columns 2 and 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

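  // DC-only shortcut: input[0] is scaled by cospi_16_64 once per dimension
  // (each product rounded at DCT_CONST_BITS), and the final
  // ROUND_POWER_OF_TWO(a, 4) matches the >> 4 of the full path, so every
  // output pixel receives the same offset a.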
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
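
// For reference, a scalar sketch of the data movement transpose_4x4() performs
// (added for exposition; not part of the upstream file). Each res[i] carries
// row i of a 4x4 int16_t block in its low 64 bits, and the unpacks above
// transpose those rows into columns.
static INLINE void transpose_4x4_c(int16_t out[4][4], const int16_t in[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      out[r][c] = in[c][r];
}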

static void idct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[2]);
  u[1] = _mm_packs_epi32(v[1], v[3]);
  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
  u[3] = _mm_unpackhi_epi64(u[1], u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[3]);
  in[1] = _mm_add_epi16(u[1], u[2]);
  in[2] = _mm_sub_epi16(u[1], u[2]);
  in[3] = _mm_sub_epi16(u[0], u[3]);
}
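
// Scalar sketch of the 4-point IDCT that idct4_1d_sse2() vectorizes, added
// for exposition (the C reference in vp9/common/vp9_idct.c is authoritative;
// this helper name is ours, not upstream's).
static INLINE void idct4_sketch_c(const int16_t *in, int16_t *out) {
  const int s0 = dct_const_round_shift((in[0] + in[2]) * cospi_16_64);
  const int s1 = dct_const_round_shift((in[0] - in[2]) * cospi_16_64);
  const int s2 = dct_const_round_shift(in[1] * cospi_24_64 -
                                       in[3] * cospi_8_64);
  const int s3 = dct_const_round_shift(in[1] * cospi_8_64 +
                                       in[3] * cospi_24_64);
  out[0] = (int16_t)(s0 + s3);
  out[1] = (int16_t)(s1 + s2);
  out[2] = (int16_t)(s1 - s2);
  out[3] = (int16_t)(s0 - s3);
}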

static void iadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_add_epi16(in[0], in[3]);
  in7 = _mm_sub_epi16(in7, in[2]);

  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[1], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}

void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

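  // Both 1-D helpers transpose their input first, so the two calls below
  // perform the row pass followed by the column pass.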
  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
  const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
  const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
  const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
  const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
  const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
  const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
  const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
  \
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
  \
  out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
  out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
  out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
  out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
  out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
  out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
  out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
}
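
// TRANSPOSE_8X8 above is the standard three-stage SSE2 transpose: interleave
// 16-bit lanes, then 32-bit pairs, then 64-bit halves, doubling the element
// span at each stage until rows become columns.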

#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
  const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
  const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
  const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
  \
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
  \
  out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
  out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
  out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
  out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
  out4 = out5 = out6 = out7 = zero; \
}

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
  const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
  const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
  const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
  const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
  \
  in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
  in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
  in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */ \
  in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */ \
}

// Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, \
                               res0, res1, res2, res3) \
{ \
  tmp0 = _mm_madd_epi16(lo_0, cst0); \
  tmp1 = _mm_madd_epi16(hi_0, cst0); \
  tmp2 = _mm_madd_epi16(lo_0, cst1); \
  tmp3 = _mm_madd_epi16(hi_0, cst1); \
  tmp4 = _mm_madd_epi16(lo_1, cst2); \
  tmp5 = _mm_madd_epi16(hi_1, cst2); \
  tmp6 = _mm_madd_epi16(lo_1, cst3); \
  tmp7 = _mm_madd_epi16(hi_1, cst3); \
  \
  tmp0 = _mm_add_epi32(tmp0, rounding); \
  tmp1 = _mm_add_epi32(tmp1, rounding); \
  tmp2 = _mm_add_epi32(tmp2, rounding); \
  tmp3 = _mm_add_epi32(tmp3, rounding); \
  tmp4 = _mm_add_epi32(tmp4, rounding); \
  tmp5 = _mm_add_epi32(tmp5, rounding); \
  tmp6 = _mm_add_epi32(tmp6, rounding); \
  tmp7 = _mm_add_epi32(tmp7, rounding); \
  \
  tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
  tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
  tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
  tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
  tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
  tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
  tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
  tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
  \
  res0 = _mm_packs_epi32(tmp0, tmp1); \
  res1 = _mm_packs_epi32(tmp2, tmp3); \
  res2 = _mm_packs_epi32(tmp4, tmp5); \
  res3 = _mm_packs_epi32(tmp6, tmp7); \
}
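
// MULTIPLICATION_AND_ADD expands to four butterfly rotations: each lo_x/hi_x
// register holds interleaved 16-bit pairs (a, b), so _mm_madd_epi16 against a
// pair_set_epi16(c0, c1) constant yields a * c0 + b * c1 per 32-bit lane,
// which is then rounded, shifted by DCT_CONST_BITS, and packed back to 16 bits.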

#define IDCT8_1D \
  /* Stage1 */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4, \
                           stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4 */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

#define RECON_AND_STORE(dest, in_x) \
{ \
  __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  _mm_storel_epi64((__m128i *)(dest), d0); \
  dest += stride; \
}
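
// RECON_AND_STORE is the 8-pixel-wide analogue of RECON_AND_STORE4X4: it
// widens one row of predictor bytes to 16 bits, adds the residual row, and
// stores with unsigned saturation before advancing dest by stride.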

void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static void idct8_1d_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  in0 = in[0];
  in1 = in[1];
  in2 = in[2];
  in3 = in[3];
  in4 = in[4];
  in5 = in[5];
  in6 = in[6];
  in7 = in[7];

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8_1D
  in[0] = in0;
  in[1] = in1;
  in[2] = in2;
  in[3] = in3;
  in[4] = in4;
  in[5] = in5;
  in[6] = in6;
  in[7] = in7;
}

static void iadst8_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}
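
// The tail of iadst8_1d_sse2() applies the ADST-8 output permutation and
// sign pattern: even-index outputs are copied directly, while odd-index
// outputs are negated (in[1] = -s4, in[3] = -s2, in[5] = -s7, in[7] = -s1).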

void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

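  // The _10 variant assumes at most 10 nonzero coefficients, which the scan
  // order confines to the top-left 4x4 corner of the 8x8 block, so only the
  // first four rows are loaded and rows 4..7 are treated as zero.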
michael@0 | 979 | // Rows. Load 4-row input data. |
michael@0 | 980 | in0 = _mm_load_si128((const __m128i *)input); |
michael@0 | 981 | in1 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
michael@0 | 982 | in2 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
michael@0 | 983 | in3 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
michael@0 | 984 | |
michael@0 | 985 | // 8x4 Transpose |
michael@0 | 986 | TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3) |
michael@0 | 987 | |
michael@0 | 988 | // Stage1 |
michael@0 | 989 | { //NOLINT |
michael@0 | 990 | const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3); |
michael@0 | 991 | const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2); |
michael@0 | 992 | |
michael@0 | 993 | tmp0 = _mm_madd_epi16(lo_17, stg1_0); |
michael@0 | 994 | tmp2 = _mm_madd_epi16(lo_17, stg1_1); |
michael@0 | 995 | tmp4 = _mm_madd_epi16(lo_35, stg1_2); |
michael@0 | 996 | tmp6 = _mm_madd_epi16(lo_35, stg1_3); |
michael@0 | 997 | |
michael@0 | 998 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 999 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 1000 | tmp4 = _mm_add_epi32(tmp4, rounding); |
michael@0 | 1001 | tmp6 = _mm_add_epi32(tmp6, rounding); |
michael@0 | 1002 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 1003 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 1004 | tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
michael@0 | 1005 | tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
michael@0 | 1006 | |
michael@0 | 1007 | stp1_4 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 1008 | stp1_7 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 1009 | stp1_5 = _mm_packs_epi32(tmp4, zero); |
michael@0 | 1010 | stp1_6 = _mm_packs_epi32(tmp6, zero); |
michael@0 | 1011 | } |
michael@0 | 1012 | |
michael@0 | 1013 | // Stage2 |
michael@0 | 1014 | { //NOLINT |
michael@0 | 1015 | const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2); |
michael@0 | 1016 | const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3); |
michael@0 | 1017 | |
michael@0 | 1018 | tmp0 = _mm_madd_epi16(lo_04, stg2_0); |
michael@0 | 1019 | tmp2 = _mm_madd_epi16(lo_04, stg2_1); |
michael@0 | 1020 | tmp4 = _mm_madd_epi16(lo_26, stg2_2); |
michael@0 | 1021 | tmp6 = _mm_madd_epi16(lo_26, stg2_3); |
michael@0 | 1022 | |
michael@0 | 1023 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 1024 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 1025 | tmp4 = _mm_add_epi32(tmp4, rounding); |
michael@0 | 1026 | tmp6 = _mm_add_epi32(tmp6, rounding); |
michael@0 | 1027 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 1028 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 1029 | tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
michael@0 | 1030 | tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
michael@0 | 1031 | |
michael@0 | 1032 | stp2_0 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 1033 | stp2_1 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 1034 | stp2_2 = _mm_packs_epi32(tmp4, zero); |
michael@0 | 1035 | stp2_3 = _mm_packs_epi32(tmp6, zero); |
michael@0 | 1036 | |
michael@0 | 1037 | stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); |
michael@0 | 1038 | stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); |
michael@0 | 1039 | stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); |
michael@0 | 1040 | stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); |
michael@0 | 1041 | } |
michael@0 | 1042 | |
michael@0 | 1043 | // Stage3 |
michael@0 | 1044 | { //NOLINT |
michael@0 | 1045 | const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); |
michael@0 | 1046 | stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); |
michael@0 | 1047 | stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); |
michael@0 | 1048 | stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); |
michael@0 | 1049 | stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); |
michael@0 | 1050 | |
michael@0 | 1051 | tmp0 = _mm_madd_epi16(lo_56, stg3_0); |
michael@0 | 1052 | tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 |
michael@0 | 1053 | |
michael@0 | 1054 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 1055 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 1056 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 1057 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 1058 | |
michael@0 | 1059 | stp1_5 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 1060 | stp1_6 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 1061 | } |
michael@0 | 1062 | |
michael@0 | 1063 | // Stage4 |
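// Stage 4 is the output butterfly: the even results stp1_0..stp1_3 are added
// to / subtracted from the odd results stp2_7, stp1_6, stp1_5, stp2_4 to give
// the eight 1-D outputs in natural order.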
michael@0 | 1064 | in0 = _mm_adds_epi16(stp1_0, stp2_7); |
michael@0 | 1065 | in1 = _mm_adds_epi16(stp1_1, stp1_6); |
michael@0 | 1066 | in2 = _mm_adds_epi16(stp1_2, stp1_5); |
michael@0 | 1067 | in3 = _mm_adds_epi16(stp1_3, stp2_4); |
michael@0 | 1068 | in4 = _mm_subs_epi16(stp1_3, stp2_4); |
michael@0 | 1069 | in5 = _mm_subs_epi16(stp1_2, stp1_5); |
michael@0 | 1070 | in6 = _mm_subs_epi16(stp1_1, stp1_6); |
michael@0 | 1071 | in7 = _mm_subs_epi16(stp1_0, stp2_7); |
michael@0 | 1072 | |
michael@0 | 1073 | // Columns: 4x8 transpose |
michael@0 | 1074 | TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
michael@0 | 1075 | in4, in5, in6, in7) |
michael@0 | 1076 | |
michael@0 | 1077 | // 1-D idct8x8 |
michael@0 | 1078 | IDCT8_1D |
michael@0 | 1079 | |
michael@0 | 1080 | // Final rounding and shift |
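// The 2-D 8x8 inverse transform carries a gain of 2^5, so each value is
// rounded to nearest by adding final_rounding (assumed to be 1 << 4, half
// the divisor) before the arithmetic shift right by 5 below.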
michael@0 | 1081 | in0 = _mm_adds_epi16(in0, final_rounding); |
michael@0 | 1082 | in1 = _mm_adds_epi16(in1, final_rounding); |
michael@0 | 1083 | in2 = _mm_adds_epi16(in2, final_rounding); |
michael@0 | 1084 | in3 = _mm_adds_epi16(in3, final_rounding); |
michael@0 | 1085 | in4 = _mm_adds_epi16(in4, final_rounding); |
michael@0 | 1086 | in5 = _mm_adds_epi16(in5, final_rounding); |
michael@0 | 1087 | in6 = _mm_adds_epi16(in6, final_rounding); |
michael@0 | 1088 | in7 = _mm_adds_epi16(in7, final_rounding); |
michael@0 | 1089 | |
michael@0 | 1090 | in0 = _mm_srai_epi16(in0, 5); |
michael@0 | 1091 | in1 = _mm_srai_epi16(in1, 5); |
michael@0 | 1092 | in2 = _mm_srai_epi16(in2, 5); |
michael@0 | 1093 | in3 = _mm_srai_epi16(in3, 5); |
michael@0 | 1094 | in4 = _mm_srai_epi16(in4, 5); |
michael@0 | 1095 | in5 = _mm_srai_epi16(in5, 5); |
michael@0 | 1096 | in6 = _mm_srai_epi16(in6, 5); |
michael@0 | 1097 | in7 = _mm_srai_epi16(in7, 5); |
michael@0 | 1098 | |
michael@0 | 1099 | RECON_AND_STORE(dest, in0); |
michael@0 | 1100 | RECON_AND_STORE(dest, in1); |
michael@0 | 1101 | RECON_AND_STORE(dest, in2); |
michael@0 | 1102 | RECON_AND_STORE(dest, in3); |
michael@0 | 1103 | RECON_AND_STORE(dest, in4); |
michael@0 | 1104 | RECON_AND_STORE(dest, in5); |
michael@0 | 1105 | RECON_AND_STORE(dest, in6); |
michael@0 | 1106 | RECON_AND_STORE(dest, in7); |
michael@0 | 1107 | } |
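
/*
 * A minimal scalar sketch of the per-row reconstruction step used above,
 * assuming RECON_AND_STORE behaves like the 4-wide RECON_AND_STORE4X4
 * defined at the top of this file, but on eight pixels: add one residual
 * row to dest with unsigned saturation (the caller advances dest by stride
 * between rows). The helper name is illustrative only.
 */
static INLINE void recon_and_store_row_ref(uint8_t *dest,
                                           const int16_t *residual) {
  int j;
  for (j = 0; j < 8; ++j)
    dest[j] = clip_pixel(dest[j] + residual[j]);  // packus-style saturation
}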
michael@0 | 1108 | |
michael@0 | 1109 | #define IDCT16_1D \ |
michael@0 | 1110 | /* Stage2 */ \ |
michael@0 | 1111 | { \ |
michael@0 | 1112 | const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \ |
michael@0 | 1113 | const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \ |
michael@0 | 1114 | const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \ |
michael@0 | 1115 | const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \ |
michael@0 | 1116 | const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \ |
michael@0 | 1117 | const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \ |
michael@0 | 1118 | const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \ |
michael@0 | 1119 | const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \ |
michael@0 | 1120 | \ |
michael@0 | 1121 | MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \ |
michael@0 | 1122 | stg2_0, stg2_1, stg2_2, stg2_3, \ |
michael@0 | 1123 | stp2_8, stp2_15, stp2_9, stp2_14) \ |
michael@0 | 1124 | \ |
michael@0 | 1125 | MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \ |
michael@0 | 1126 | stg2_4, stg2_5, stg2_6, stg2_7, \ |
michael@0 | 1127 | stp2_10, stp2_13, stp2_11, stp2_12) \ |
michael@0 | 1128 | } \ |
michael@0 | 1129 | \ |
michael@0 | 1130 | /* Stage3 */ \ |
michael@0 | 1131 | { \ |
michael@0 | 1132 | const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \ |
michael@0 | 1133 | const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \ |
michael@0 | 1134 | const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \ |
michael@0 | 1135 | const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \ |
michael@0 | 1136 | \ |
michael@0 | 1137 | MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \ |
michael@0 | 1138 | stg3_0, stg3_1, stg3_2, stg3_3, \ |
michael@0 | 1139 | stp1_4, stp1_7, stp1_5, stp1_6) \ |
michael@0 | 1140 | \ |
michael@0 | 1141 | stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ |
michael@0 | 1142 | stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ |
michael@0 | 1143 | stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ |
michael@0 | 1144 | stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ |
michael@0 | 1145 | \ |
michael@0 | 1146 | stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ |
michael@0 | 1147 | stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ |
michael@0 | 1148 | stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ |
michael@0 | 1149 | stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ |
michael@0 | 1150 | } \ |
michael@0 | 1151 | \ |
michael@0 | 1152 | /* Stage4 */ \ |
michael@0 | 1153 | { \ |
michael@0 | 1154 | const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \ |
michael@0 | 1155 | const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \ |
michael@0 | 1156 | const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \ |
michael@0 | 1157 | const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \ |
michael@0 | 1158 | \ |
michael@0 | 1159 | const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ |
michael@0 | 1160 | const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ |
michael@0 | 1161 | const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
michael@0 | 1162 | const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
michael@0 | 1163 | \ |
michael@0 | 1164 | MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \ |
michael@0 | 1165 | stg4_0, stg4_1, stg4_2, stg4_3, \ |
michael@0 | 1166 | stp2_0, stp2_1, stp2_2, stp2_3) \ |
michael@0 | 1167 | \ |
michael@0 | 1168 | stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ |
michael@0 | 1169 | stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ |
michael@0 | 1170 | stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ |
michael@0 | 1171 | stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ |
michael@0 | 1172 | \ |
michael@0 | 1173 | MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \ |
michael@0 | 1174 | stg4_4, stg4_5, stg4_6, stg4_7, \ |
michael@0 | 1175 | stp2_9, stp2_14, stp2_10, stp2_13) \ |
michael@0 | 1176 | } \ |
michael@0 | 1177 | \ |
michael@0 | 1178 | /* Stage5 */ \ |
michael@0 | 1179 | { \ |
michael@0 | 1180 | const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ |
michael@0 | 1181 | const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ |
michael@0 | 1182 | \ |
michael@0 | 1183 | stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ |
michael@0 | 1184 | stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ |
michael@0 | 1185 | stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ |
michael@0 | 1186 | stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ |
michael@0 | 1187 | \ |
michael@0 | 1188 | tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ |
michael@0 | 1189 | tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ |
michael@0 | 1190 | tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ |
michael@0 | 1191 | tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ |
michael@0 | 1192 | \ |
michael@0 | 1193 | tmp0 = _mm_add_epi32(tmp0, rounding); \ |
michael@0 | 1194 | tmp1 = _mm_add_epi32(tmp1, rounding); \ |
michael@0 | 1195 | tmp2 = _mm_add_epi32(tmp2, rounding); \ |
michael@0 | 1196 | tmp3 = _mm_add_epi32(tmp3, rounding); \ |
michael@0 | 1197 | \ |
michael@0 | 1198 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ |
michael@0 | 1199 | tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ |
michael@0 | 1200 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ |
michael@0 | 1201 | tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ |
michael@0 | 1202 | \ |
michael@0 | 1203 | stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ |
michael@0 | 1204 | stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ |
michael@0 | 1205 | \ |
michael@0 | 1206 | stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ |
michael@0 | 1207 | stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ |
michael@0 | 1208 | stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ |
michael@0 | 1209 | stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ |
michael@0 | 1210 | \ |
michael@0 | 1211 | stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ |
michael@0 | 1212 | stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ |
michael@0 | 1213 | stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ |
michael@0 | 1214 | stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ |
michael@0 | 1215 | } \ |
michael@0 | 1216 | \ |
michael@0 | 1217 | /* Stage6 */ \ |
michael@0 | 1218 | { \ |
michael@0 | 1219 | const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
michael@0 | 1220 | const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
michael@0 | 1221 | const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ |
michael@0 | 1222 | const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ |
michael@0 | 1223 | \ |
michael@0 | 1224 | stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ |
michael@0 | 1225 | stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ |
michael@0 | 1226 | stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ |
michael@0 | 1227 | stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ |
michael@0 | 1228 | stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ |
michael@0 | 1229 | stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
michael@0 | 1230 | stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
michael@0 | 1231 | stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ |
michael@0 | 1232 | \ |
michael@0 | 1233 | MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
michael@0 | 1234 | stg6_0, stg4_0, stg6_0, stg4_0, \ |
michael@0 | 1235 | stp2_10, stp2_13, stp2_11, stp2_12) \ |
michael@0 | 1236 | } |
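
/*
 * Assumed semantics of each MULTIPLICATION_AND_ADD above: every output is
 * one butterfly rotation, out = round_shift(a * c0 + b * c1), computed on
 * eight 16-bit lanes at once via _mm_madd_epi16 on the interleaved inputs.
 * A one-lane scalar sketch (the helper name is illustrative only):
 */
static INLINE int16_t butterfly_lane_ref(int16_t a, int16_t b,
                                         int16_t c0, int16_t c1) {
  /* _mm_madd_epi16 forms a * c0 + b * c1 in 32 bits; the macro then adds
   * DCT_CONST_ROUNDING and shifts right by DCT_CONST_BITS, which is what
   * dct_const_round_shift does. */
  return (int16_t)dct_const_round_shift(a * c0 + b * c1);
}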
michael@0 | 1237 | |
michael@0 | 1238 | void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
michael@0 | 1239 | int stride) { |
michael@0 | 1240 | const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
michael@0 | 1241 | const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
michael@0 | 1242 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 1243 | |
michael@0 | 1244 | const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
michael@0 | 1245 | const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
michael@0 | 1246 | const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
michael@0 | 1247 | const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
michael@0 | 1248 | const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
michael@0 | 1249 | const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
michael@0 | 1250 | const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
michael@0 | 1251 | const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
michael@0 | 1252 | |
michael@0 | 1253 | const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
michael@0 | 1254 | const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
michael@0 | 1255 | const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
michael@0 | 1256 | const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
michael@0 | 1257 | |
michael@0 | 1258 | const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
michael@0 | 1259 | const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
michael@0 | 1260 | const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
michael@0 | 1261 | const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
michael@0 | 1262 | const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 1263 | const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
michael@0 | 1264 | const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
michael@0 | 1265 | const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 1266 | |
michael@0 | 1267 | const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
michael@0 | 1268 | |
michael@0 | 1269 | __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, |
michael@0 | 1270 | in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, |
michael@0 | 1271 | in10 = zero, in11 = zero, in12 = zero, in13 = zero, |
michael@0 | 1272 | in14 = zero, in15 = zero; |
michael@0 | 1273 | __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, |
michael@0 | 1274 | l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, |
michael@0 | 1275 | l12 = zero, l13 = zero, l14 = zero, l15 = zero; |
michael@0 | 1276 | __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, |
michael@0 | 1277 | r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, |
michael@0 | 1278 | r12 = zero, r13 = zero, r14 = zero, r15 = zero; |
michael@0 | 1279 | __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
michael@0 | 1280 | stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
michael@0 | 1281 | stp1_8_0, stp1_12_0; |
michael@0 | 1282 | __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
michael@0 | 1283 | stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
michael@0 | 1284 | __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
michael@0 | 1285 | int i; |
michael@0 | 1286 | |
michael@0 | 1287 | // We work on an 8x16 block each time, and loop 4 times for the 2-D 16x16 idct. |
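// Passes 0 and 1 run the 1-D row transform over the top and bottom eight
// rows of coefficients, parking the results in l0..l15 and r0..r15. Passes
// 2 and 3 re-transpose those results and run the column transform over the
// left and right eight columns, rounding and accumulating into dest.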
michael@0 | 1288 | for (i = 0; i < 4; i++) { |
michael@0 | 1289 | // 1-D idct |
michael@0 | 1290 | if (i < 2) { |
michael@0 | 1291 | if (i == 1) input += 128; |
michael@0 | 1292 | |
michael@0 | 1293 | // Load input data. |
michael@0 | 1294 | in0 = _mm_load_si128((const __m128i *)input); |
michael@0 | 1295 | in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
michael@0 | 1296 | in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
michael@0 | 1297 | in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
michael@0 | 1298 | in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
michael@0 | 1299 | in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
michael@0 | 1300 | in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
michael@0 | 1301 | in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
michael@0 | 1302 | in4 = _mm_load_si128((const __m128i *)(input + 8 * 8)); |
michael@0 | 1303 | in12 = _mm_load_si128((const __m128i *)(input + 8 * 9)); |
michael@0 | 1304 | in5 = _mm_load_si128((const __m128i *)(input + 8 * 10)); |
michael@0 | 1305 | in13 = _mm_load_si128((const __m128i *)(input + 8 * 11)); |
michael@0 | 1306 | in6 = _mm_load_si128((const __m128i *)(input + 8 * 12)); |
michael@0 | 1307 | in14 = _mm_load_si128((const __m128i *)(input + 8 * 13)); |
michael@0 | 1308 | in7 = _mm_load_si128((const __m128i *)(input + 8 * 14)); |
michael@0 | 1309 | in15 = _mm_load_si128((const __m128i *)(input + 8 * 15)); |
michael@0 | 1310 | |
michael@0 | 1311 | TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
michael@0 | 1312 | in4, in5, in6, in7); |
michael@0 | 1313 | TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
michael@0 | 1314 | in10, in11, in12, in13, in14, in15); |
michael@0 | 1315 | } |
michael@0 | 1316 | |
michael@0 | 1317 | if (i == 2) { |
michael@0 | 1318 | TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, |
michael@0 | 1319 | in5, in6, in7); |
michael@0 | 1320 | TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12, |
michael@0 | 1321 | in13, in14, in15); |
michael@0 | 1322 | } |
michael@0 | 1323 | |
michael@0 | 1324 | if (i == 3) { |
michael@0 | 1325 | TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, |
michael@0 | 1326 | in4, in5, in6, in7); |
michael@0 | 1327 | TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11, |
michael@0 | 1328 | in12, in13, in14, in15); |
michael@0 | 1329 | } |
michael@0 | 1330 | |
michael@0 | 1331 | IDCT16_1D |
michael@0 | 1332 | |
michael@0 | 1333 | // Stage7 |
michael@0 | 1334 | if (i == 0) { |
michael@0 | 1335 | // Left 8x16 |
michael@0 | 1336 | l0 = _mm_add_epi16(stp2_0, stp1_15); |
michael@0 | 1337 | l1 = _mm_add_epi16(stp2_1, stp1_14); |
michael@0 | 1338 | l2 = _mm_add_epi16(stp2_2, stp2_13); |
michael@0 | 1339 | l3 = _mm_add_epi16(stp2_3, stp2_12); |
michael@0 | 1340 | l4 = _mm_add_epi16(stp2_4, stp2_11); |
michael@0 | 1341 | l5 = _mm_add_epi16(stp2_5, stp2_10); |
michael@0 | 1342 | l6 = _mm_add_epi16(stp2_6, stp1_9); |
michael@0 | 1343 | l7 = _mm_add_epi16(stp2_7, stp1_8); |
michael@0 | 1344 | l8 = _mm_sub_epi16(stp2_7, stp1_8); |
michael@0 | 1345 | l9 = _mm_sub_epi16(stp2_6, stp1_9); |
michael@0 | 1346 | l10 = _mm_sub_epi16(stp2_5, stp2_10); |
michael@0 | 1347 | l11 = _mm_sub_epi16(stp2_4, stp2_11); |
michael@0 | 1348 | l12 = _mm_sub_epi16(stp2_3, stp2_12); |
michael@0 | 1349 | l13 = _mm_sub_epi16(stp2_2, stp2_13); |
michael@0 | 1350 | l14 = _mm_sub_epi16(stp2_1, stp1_14); |
michael@0 | 1351 | l15 = _mm_sub_epi16(stp2_0, stp1_15); |
michael@0 | 1352 | } else if (i == 1) { |
michael@0 | 1353 | // Right 8x16 |
michael@0 | 1354 | r0 = _mm_add_epi16(stp2_0, stp1_15); |
michael@0 | 1355 | r1 = _mm_add_epi16(stp2_1, stp1_14); |
michael@0 | 1356 | r2 = _mm_add_epi16(stp2_2, stp2_13); |
michael@0 | 1357 | r3 = _mm_add_epi16(stp2_3, stp2_12); |
michael@0 | 1358 | r4 = _mm_add_epi16(stp2_4, stp2_11); |
michael@0 | 1359 | r5 = _mm_add_epi16(stp2_5, stp2_10); |
michael@0 | 1360 | r6 = _mm_add_epi16(stp2_6, stp1_9); |
michael@0 | 1361 | r7 = _mm_add_epi16(stp2_7, stp1_8); |
michael@0 | 1362 | r8 = _mm_sub_epi16(stp2_7, stp1_8); |
michael@0 | 1363 | r9 = _mm_sub_epi16(stp2_6, stp1_9); |
michael@0 | 1364 | r10 = _mm_sub_epi16(stp2_5, stp2_10); |
michael@0 | 1365 | r11 = _mm_sub_epi16(stp2_4, stp2_11); |
michael@0 | 1366 | r12 = _mm_sub_epi16(stp2_3, stp2_12); |
michael@0 | 1367 | r13 = _mm_sub_epi16(stp2_2, stp2_13); |
michael@0 | 1368 | r14 = _mm_sub_epi16(stp2_1, stp1_14); |
michael@0 | 1369 | r15 = _mm_sub_epi16(stp2_0, stp1_15); |
michael@0 | 1370 | } else { |
michael@0 | 1371 | // 2-D: the column pass completes the transform; round, shift and store below. |
michael@0 | 1372 | in0 = _mm_add_epi16(stp2_0, stp1_15); |
michael@0 | 1373 | in1 = _mm_add_epi16(stp2_1, stp1_14); |
michael@0 | 1374 | in2 = _mm_add_epi16(stp2_2, stp2_13); |
michael@0 | 1375 | in3 = _mm_add_epi16(stp2_3, stp2_12); |
michael@0 | 1376 | in4 = _mm_add_epi16(stp2_4, stp2_11); |
michael@0 | 1377 | in5 = _mm_add_epi16(stp2_5, stp2_10); |
michael@0 | 1378 | in6 = _mm_add_epi16(stp2_6, stp1_9); |
michael@0 | 1379 | in7 = _mm_add_epi16(stp2_7, stp1_8); |
michael@0 | 1380 | in8 = _mm_sub_epi16(stp2_7, stp1_8); |
michael@0 | 1381 | in9 = _mm_sub_epi16(stp2_6, stp1_9); |
michael@0 | 1382 | in10 = _mm_sub_epi16(stp2_5, stp2_10); |
michael@0 | 1383 | in11 = _mm_sub_epi16(stp2_4, stp2_11); |
michael@0 | 1384 | in12 = _mm_sub_epi16(stp2_3, stp2_12); |
michael@0 | 1385 | in13 = _mm_sub_epi16(stp2_2, stp2_13); |
michael@0 | 1386 | in14 = _mm_sub_epi16(stp2_1, stp1_14); |
michael@0 | 1387 | in15 = _mm_sub_epi16(stp2_0, stp1_15); |
michael@0 | 1388 | |
michael@0 | 1389 | // Final rounding and shift |
michael@0 | 1390 | in0 = _mm_adds_epi16(in0, final_rounding); |
michael@0 | 1391 | in1 = _mm_adds_epi16(in1, final_rounding); |
michael@0 | 1392 | in2 = _mm_adds_epi16(in2, final_rounding); |
michael@0 | 1393 | in3 = _mm_adds_epi16(in3, final_rounding); |
michael@0 | 1394 | in4 = _mm_adds_epi16(in4, final_rounding); |
michael@0 | 1395 | in5 = _mm_adds_epi16(in5, final_rounding); |
michael@0 | 1396 | in6 = _mm_adds_epi16(in6, final_rounding); |
michael@0 | 1397 | in7 = _mm_adds_epi16(in7, final_rounding); |
michael@0 | 1398 | in8 = _mm_adds_epi16(in8, final_rounding); |
michael@0 | 1399 | in9 = _mm_adds_epi16(in9, final_rounding); |
michael@0 | 1400 | in10 = _mm_adds_epi16(in10, final_rounding); |
michael@0 | 1401 | in11 = _mm_adds_epi16(in11, final_rounding); |
michael@0 | 1402 | in12 = _mm_adds_epi16(in12, final_rounding); |
michael@0 | 1403 | in13 = _mm_adds_epi16(in13, final_rounding); |
michael@0 | 1404 | in14 = _mm_adds_epi16(in14, final_rounding); |
michael@0 | 1405 | in15 = _mm_adds_epi16(in15, final_rounding); |
michael@0 | 1406 | |
michael@0 | 1407 | in0 = _mm_srai_epi16(in0, 6); |
michael@0 | 1408 | in1 = _mm_srai_epi16(in1, 6); |
michael@0 | 1409 | in2 = _mm_srai_epi16(in2, 6); |
michael@0 | 1410 | in3 = _mm_srai_epi16(in3, 6); |
michael@0 | 1411 | in4 = _mm_srai_epi16(in4, 6); |
michael@0 | 1412 | in5 = _mm_srai_epi16(in5, 6); |
michael@0 | 1413 | in6 = _mm_srai_epi16(in6, 6); |
michael@0 | 1414 | in7 = _mm_srai_epi16(in7, 6); |
michael@0 | 1415 | in8 = _mm_srai_epi16(in8, 6); |
michael@0 | 1416 | in9 = _mm_srai_epi16(in9, 6); |
michael@0 | 1417 | in10 = _mm_srai_epi16(in10, 6); |
michael@0 | 1418 | in11 = _mm_srai_epi16(in11, 6); |
michael@0 | 1419 | in12 = _mm_srai_epi16(in12, 6); |
michael@0 | 1420 | in13 = _mm_srai_epi16(in13, 6); |
michael@0 | 1421 | in14 = _mm_srai_epi16(in14, 6); |
michael@0 | 1422 | in15 = _mm_srai_epi16(in15, 6); |
michael@0 | 1423 | |
michael@0 | 1424 | RECON_AND_STORE(dest, in0); |
michael@0 | 1425 | RECON_AND_STORE(dest, in1); |
michael@0 | 1426 | RECON_AND_STORE(dest, in2); |
michael@0 | 1427 | RECON_AND_STORE(dest, in3); |
michael@0 | 1428 | RECON_AND_STORE(dest, in4); |
michael@0 | 1429 | RECON_AND_STORE(dest, in5); |
michael@0 | 1430 | RECON_AND_STORE(dest, in6); |
michael@0 | 1431 | RECON_AND_STORE(dest, in7); |
michael@0 | 1432 | RECON_AND_STORE(dest, in8); |
michael@0 | 1433 | RECON_AND_STORE(dest, in9); |
michael@0 | 1434 | RECON_AND_STORE(dest, in10); |
michael@0 | 1435 | RECON_AND_STORE(dest, in11); |
michael@0 | 1436 | RECON_AND_STORE(dest, in12); |
michael@0 | 1437 | RECON_AND_STORE(dest, in13); |
michael@0 | 1438 | RECON_AND_STORE(dest, in14); |
michael@0 | 1439 | RECON_AND_STORE(dest, in15); |
michael@0 | 1440 | |
michael@0 | 1441 | dest += 8 - (stride * 16); |
michael@0 | 1442 | } |
michael@0 | 1443 | } |
michael@0 | 1444 | } |
michael@0 | 1445 | |
michael@0 | 1446 | void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
michael@0 | 1447 | __m128i dc_value; |
michael@0 | 1448 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 1449 | int a, i; |
michael@0 | 1450 | |
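// DC-only shortcut: with every AC coefficient zero, each 1-D pass reduces to
// scaling the DC term by cospi_16_64 followed by dct_const_round_shift, and
// ROUND_POWER_OF_TWO(a, 6) stands in for the full transform's final rounding
// and shift. All 256 output pixels then receive the same residual a.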
michael@0 | 1451 | a = dct_const_round_shift(input[0] * cospi_16_64); |
michael@0 | 1452 | a = dct_const_round_shift(a * cospi_16_64); |
michael@0 | 1453 | a = ROUND_POWER_OF_TWO(a, 6); |
michael@0 | 1454 | |
michael@0 | 1455 | dc_value = _mm_set1_epi16(a); |
michael@0 | 1456 | |
michael@0 | 1457 | for (i = 0; i < 2; ++i) { |
michael@0 | 1458 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1459 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1460 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1461 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1462 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1463 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1464 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1465 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1466 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1467 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1468 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1469 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1470 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1471 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1472 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1473 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 1474 | dest += 8 - (stride * 16); |
michael@0 | 1475 | } |
michael@0 | 1476 | } |
michael@0 | 1477 | |
michael@0 | 1478 | static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { |
michael@0 | 1479 | __m128i tbuf[8]; |
michael@0 | 1480 | array_transpose_8x8(res0, res0); |
michael@0 | 1481 | array_transpose_8x8(res1, tbuf); |
michael@0 | 1482 | array_transpose_8x8(res0 + 8, res1); |
michael@0 | 1483 | array_transpose_8x8(res1 + 8, res1 + 8); |
michael@0 | 1484 | |
michael@0 | 1485 | res0[8] = tbuf[0]; |
michael@0 | 1486 | res0[9] = tbuf[1]; |
michael@0 | 1487 | res0[10] = tbuf[2]; |
michael@0 | 1488 | res0[11] = tbuf[3]; |
michael@0 | 1489 | res0[12] = tbuf[4]; |
michael@0 | 1490 | res0[13] = tbuf[5]; |
michael@0 | 1491 | res0[14] = tbuf[6]; |
michael@0 | 1492 | res0[15] = tbuf[7]; |
michael@0 | 1493 | } |
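
// The 16x16 transpose above is composed of four 8x8 block transposes; res1's
// first half is transposed into tbuf before res0's second half is transposed
// over it, and tbuf then becomes res0's second half.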
michael@0 | 1494 | |
michael@0 | 1495 | static void iadst16_1d_8col(__m128i *in) { |
michael@0 | 1496 | // Perform a 16-point 1-D ADST on 8 columns. |
michael@0 | 1497 | __m128i s[16], x[16], u[32], v[32]; |
michael@0 | 1498 | const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); |
michael@0 | 1499 | const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
michael@0 | 1500 | const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); |
michael@0 | 1501 | const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
michael@0 | 1502 | const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); |
michael@0 | 1503 | const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
michael@0 | 1504 | const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); |
michael@0 | 1505 | const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
michael@0 | 1506 | const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); |
michael@0 | 1507 | const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
michael@0 | 1508 | const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); |
michael@0 | 1509 | const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); |
michael@0 | 1510 | const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); |
michael@0 | 1511 | const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
michael@0 | 1512 | const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); |
michael@0 | 1513 | const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
michael@0 | 1514 | const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); |
michael@0 | 1515 | const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
michael@0 | 1516 | const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); |
michael@0 | 1517 | const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
michael@0 | 1518 | const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); |
michael@0 | 1519 | const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); |
michael@0 | 1520 | const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
michael@0 | 1521 | const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
michael@0 | 1522 | const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); |
michael@0 | 1523 | const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); |
michael@0 | 1524 | const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
michael@0 | 1525 | const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
michael@0 | 1526 | const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
michael@0 | 1527 | const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
michael@0 | 1528 | const __m128i kZero = _mm_set1_epi16(0); |
michael@0 | 1529 | |
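// Stage 1: interleave the input pairs (in[15], in[0]), (in[13], in[2]), ...,
// (in[1], in[14]) and rotate each by an odd cospi angle; the sums and
// differences of the two halves then go through the rounding and shift below.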
michael@0 | 1530 | u[0] = _mm_unpacklo_epi16(in[15], in[0]); |
michael@0 | 1531 | u[1] = _mm_unpackhi_epi16(in[15], in[0]); |
michael@0 | 1532 | u[2] = _mm_unpacklo_epi16(in[13], in[2]); |
michael@0 | 1533 | u[3] = _mm_unpackhi_epi16(in[13], in[2]); |
michael@0 | 1534 | u[4] = _mm_unpacklo_epi16(in[11], in[4]); |
michael@0 | 1535 | u[5] = _mm_unpackhi_epi16(in[11], in[4]); |
michael@0 | 1536 | u[6] = _mm_unpacklo_epi16(in[9], in[6]); |
michael@0 | 1537 | u[7] = _mm_unpackhi_epi16(in[9], in[6]); |
michael@0 | 1538 | u[8] = _mm_unpacklo_epi16(in[7], in[8]); |
michael@0 | 1539 | u[9] = _mm_unpackhi_epi16(in[7], in[8]); |
michael@0 | 1540 | u[10] = _mm_unpacklo_epi16(in[5], in[10]); |
michael@0 | 1541 | u[11] = _mm_unpackhi_epi16(in[5], in[10]); |
michael@0 | 1542 | u[12] = _mm_unpacklo_epi16(in[3], in[12]); |
michael@0 | 1543 | u[13] = _mm_unpackhi_epi16(in[3], in[12]); |
michael@0 | 1544 | u[14] = _mm_unpacklo_epi16(in[1], in[14]); |
michael@0 | 1545 | u[15] = _mm_unpackhi_epi16(in[1], in[14]); |
michael@0 | 1546 | |
michael@0 | 1547 | v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); |
michael@0 | 1548 | v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); |
michael@0 | 1549 | v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); |
michael@0 | 1550 | v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); |
michael@0 | 1551 | v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); |
michael@0 | 1552 | v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); |
michael@0 | 1553 | v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); |
michael@0 | 1554 | v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); |
michael@0 | 1555 | v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); |
michael@0 | 1556 | v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); |
michael@0 | 1557 | v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); |
michael@0 | 1558 | v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); |
michael@0 | 1559 | v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); |
michael@0 | 1560 | v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); |
michael@0 | 1561 | v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); |
michael@0 | 1562 | v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); |
michael@0 | 1563 | v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); |
michael@0 | 1564 | v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); |
michael@0 | 1565 | v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); |
michael@0 | 1566 | v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); |
michael@0 | 1567 | v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); |
michael@0 | 1568 | v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); |
michael@0 | 1569 | v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); |
michael@0 | 1570 | v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); |
michael@0 | 1571 | v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); |
michael@0 | 1572 | v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); |
michael@0 | 1573 | v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); |
michael@0 | 1574 | v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); |
michael@0 | 1575 | v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); |
michael@0 | 1576 | v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); |
michael@0 | 1577 | v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); |
michael@0 | 1578 | v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); |
michael@0 | 1579 | |
michael@0 | 1580 | u[0] = _mm_add_epi32(v[0], v[16]); |
michael@0 | 1581 | u[1] = _mm_add_epi32(v[1], v[17]); |
michael@0 | 1582 | u[2] = _mm_add_epi32(v[2], v[18]); |
michael@0 | 1583 | u[3] = _mm_add_epi32(v[3], v[19]); |
michael@0 | 1584 | u[4] = _mm_add_epi32(v[4], v[20]); |
michael@0 | 1585 | u[5] = _mm_add_epi32(v[5], v[21]); |
michael@0 | 1586 | u[6] = _mm_add_epi32(v[6], v[22]); |
michael@0 | 1587 | u[7] = _mm_add_epi32(v[7], v[23]); |
michael@0 | 1588 | u[8] = _mm_add_epi32(v[8], v[24]); |
michael@0 | 1589 | u[9] = _mm_add_epi32(v[9], v[25]); |
michael@0 | 1590 | u[10] = _mm_add_epi32(v[10], v[26]); |
michael@0 | 1591 | u[11] = _mm_add_epi32(v[11], v[27]); |
michael@0 | 1592 | u[12] = _mm_add_epi32(v[12], v[28]); |
michael@0 | 1593 | u[13] = _mm_add_epi32(v[13], v[29]); |
michael@0 | 1594 | u[14] = _mm_add_epi32(v[14], v[30]); |
michael@0 | 1595 | u[15] = _mm_add_epi32(v[15], v[31]); |
michael@0 | 1596 | u[16] = _mm_sub_epi32(v[0], v[16]); |
michael@0 | 1597 | u[17] = _mm_sub_epi32(v[1], v[17]); |
michael@0 | 1598 | u[18] = _mm_sub_epi32(v[2], v[18]); |
michael@0 | 1599 | u[19] = _mm_sub_epi32(v[3], v[19]); |
michael@0 | 1600 | u[20] = _mm_sub_epi32(v[4], v[20]); |
michael@0 | 1601 | u[21] = _mm_sub_epi32(v[5], v[21]); |
michael@0 | 1602 | u[22] = _mm_sub_epi32(v[6], v[22]); |
michael@0 | 1603 | u[23] = _mm_sub_epi32(v[7], v[23]); |
michael@0 | 1604 | u[24] = _mm_sub_epi32(v[8], v[24]); |
michael@0 | 1605 | u[25] = _mm_sub_epi32(v[9], v[25]); |
michael@0 | 1606 | u[26] = _mm_sub_epi32(v[10], v[26]); |
michael@0 | 1607 | u[27] = _mm_sub_epi32(v[11], v[27]); |
michael@0 | 1608 | u[28] = _mm_sub_epi32(v[12], v[28]); |
michael@0 | 1609 | u[29] = _mm_sub_epi32(v[13], v[29]); |
michael@0 | 1610 | u[30] = _mm_sub_epi32(v[14], v[30]); |
michael@0 | 1611 | u[31] = _mm_sub_epi32(v[15], v[31]); |
michael@0 | 1612 | |
michael@0 | 1613 | v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1614 | v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1615 | v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1616 | v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1617 | v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1618 | v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1619 | v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1620 | v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1621 | v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); |
michael@0 | 1622 | v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); |
michael@0 | 1623 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 1624 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 1625 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 1626 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 1627 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 1628 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 1629 | v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); |
michael@0 | 1630 | v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); |
michael@0 | 1631 | v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); |
michael@0 | 1632 | v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); |
michael@0 | 1633 | v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); |
michael@0 | 1634 | v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); |
michael@0 | 1635 | v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); |
michael@0 | 1636 | v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); |
michael@0 | 1637 | v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); |
michael@0 | 1638 | v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); |
michael@0 | 1639 | v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); |
michael@0 | 1640 | v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); |
michael@0 | 1641 | v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); |
michael@0 | 1642 | v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); |
michael@0 | 1643 | v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); |
michael@0 | 1644 | v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); |
michael@0 | 1645 | |
michael@0 | 1646 | u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
michael@0 | 1647 | u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
michael@0 | 1648 | u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
michael@0 | 1649 | u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
michael@0 | 1650 | u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); |
michael@0 | 1651 | u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); |
michael@0 | 1652 | u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); |
michael@0 | 1653 | u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); |
michael@0 | 1654 | u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); |
michael@0 | 1655 | u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); |
michael@0 | 1656 | u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 1657 | u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 1658 | u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 1659 | u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 1660 | u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 1661 | u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 1662 | u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); |
michael@0 | 1663 | u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); |
michael@0 | 1664 | u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); |
michael@0 | 1665 | u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); |
michael@0 | 1666 | u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); |
michael@0 | 1667 | u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); |
michael@0 | 1668 | u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); |
michael@0 | 1669 | u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); |
michael@0 | 1670 | u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); |
michael@0 | 1671 | u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); |
michael@0 | 1672 | u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); |
michael@0 | 1673 | u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); |
michael@0 | 1674 | u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); |
michael@0 | 1675 | u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); |
michael@0 | 1676 | u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); |
michael@0 | 1677 | u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); |
michael@0 | 1678 | |
michael@0 | 1679 | s[0] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 1680 | s[1] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 1681 | s[2] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 1682 | s[3] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 1683 | s[4] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 1684 | s[5] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 1685 | s[6] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 1686 | s[7] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 1687 | s[8] = _mm_packs_epi32(u[16], u[17]); |
michael@0 | 1688 | s[9] = _mm_packs_epi32(u[18], u[19]); |
michael@0 | 1689 | s[10] = _mm_packs_epi32(u[20], u[21]); |
michael@0 | 1690 | s[11] = _mm_packs_epi32(u[22], u[23]); |
michael@0 | 1691 | s[12] = _mm_packs_epi32(u[24], u[25]); |
michael@0 | 1692 | s[13] = _mm_packs_epi32(u[26], u[27]); |
michael@0 | 1693 | s[14] = _mm_packs_epi32(u[28], u[29]); |
michael@0 | 1694 | s[15] = _mm_packs_epi32(u[30], u[31]); |
michael@0 | 1695 | |
michael@0 | 1696 | // stage 2 |
michael@0 | 1697 | u[0] = _mm_unpacklo_epi16(s[8], s[9]); |
michael@0 | 1698 | u[1] = _mm_unpackhi_epi16(s[8], s[9]); |
michael@0 | 1699 | u[2] = _mm_unpacklo_epi16(s[10], s[11]); |
michael@0 | 1700 | u[3] = _mm_unpackhi_epi16(s[10], s[11]); |
michael@0 | 1701 | u[4] = _mm_unpacklo_epi16(s[12], s[13]); |
michael@0 | 1702 | u[5] = _mm_unpackhi_epi16(s[12], s[13]); |
michael@0 | 1703 | u[6] = _mm_unpacklo_epi16(s[14], s[15]); |
michael@0 | 1704 | u[7] = _mm_unpackhi_epi16(s[14], s[15]); |
michael@0 | 1705 | |
michael@0 | 1706 | v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); |
michael@0 | 1707 | v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); |
michael@0 | 1708 | v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); |
michael@0 | 1709 | v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); |
michael@0 | 1710 | v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); |
michael@0 | 1711 | v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); |
michael@0 | 1712 | v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); |
michael@0 | 1713 | v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); |
michael@0 | 1714 | v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); |
michael@0 | 1715 | v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); |
michael@0 | 1716 | v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); |
michael@0 | 1717 | v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); |
michael@0 | 1718 | v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); |
michael@0 | 1719 | v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); |
michael@0 | 1720 | v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); |
michael@0 | 1721 | v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); |
michael@0 | 1722 | |
michael@0 | 1723 | u[0] = _mm_add_epi32(v[0], v[8]); |
michael@0 | 1724 | u[1] = _mm_add_epi32(v[1], v[9]); |
michael@0 | 1725 | u[2] = _mm_add_epi32(v[2], v[10]); |
michael@0 | 1726 | u[3] = _mm_add_epi32(v[3], v[11]); |
michael@0 | 1727 | u[4] = _mm_add_epi32(v[4], v[12]); |
michael@0 | 1728 | u[5] = _mm_add_epi32(v[5], v[13]); |
michael@0 | 1729 | u[6] = _mm_add_epi32(v[6], v[14]); |
michael@0 | 1730 | u[7] = _mm_add_epi32(v[7], v[15]); |
michael@0 | 1731 | u[8] = _mm_sub_epi32(v[0], v[8]); |
michael@0 | 1732 | u[9] = _mm_sub_epi32(v[1], v[9]); |
michael@0 | 1733 | u[10] = _mm_sub_epi32(v[2], v[10]); |
michael@0 | 1734 | u[11] = _mm_sub_epi32(v[3], v[11]); |
michael@0 | 1735 | u[12] = _mm_sub_epi32(v[4], v[12]); |
michael@0 | 1736 | u[13] = _mm_sub_epi32(v[5], v[13]); |
michael@0 | 1737 | u[14] = _mm_sub_epi32(v[6], v[14]); |
michael@0 | 1738 | u[15] = _mm_sub_epi32(v[7], v[15]); |
michael@0 | 1739 | |
michael@0 | 1740 | v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1741 | v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1742 | v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1743 | v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1744 | v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1745 | v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1746 | v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1747 | v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1748 | v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); |
michael@0 | 1749 | v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); |
michael@0 | 1750 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 1751 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 1752 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 1753 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 1754 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 1755 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 1756 | |
michael@0 | 1757 | u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
michael@0 | 1758 | u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
michael@0 | 1759 | u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
michael@0 | 1760 | u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
michael@0 | 1761 | u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); |
michael@0 | 1762 | u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); |
michael@0 | 1763 | u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); |
michael@0 | 1764 | u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); |
michael@0 | 1765 | u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); |
michael@0 | 1766 | u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); |
michael@0 | 1767 | u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 1768 | u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 1769 | u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 1770 | u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 1771 | u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 1772 | u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 1773 | |
michael@0 | 1774 | x[0] = _mm_add_epi16(s[0], s[4]); |
michael@0 | 1775 | x[1] = _mm_add_epi16(s[1], s[5]); |
michael@0 | 1776 | x[2] = _mm_add_epi16(s[2], s[6]); |
michael@0 | 1777 | x[3] = _mm_add_epi16(s[3], s[7]); |
michael@0 | 1778 | x[4] = _mm_sub_epi16(s[0], s[4]); |
michael@0 | 1779 | x[5] = _mm_sub_epi16(s[1], s[5]); |
michael@0 | 1780 | x[6] = _mm_sub_epi16(s[2], s[6]); |
michael@0 | 1781 | x[7] = _mm_sub_epi16(s[3], s[7]); |
michael@0 | 1782 | x[8] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 1783 | x[9] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 1784 | x[10] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 1785 | x[11] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 1786 | x[12] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 1787 | x[13] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 1788 | x[14] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 1789 | x[15] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 1790 | |
michael@0 | 1791 | // stage 3 |
michael@0 | 1792 | u[0] = _mm_unpacklo_epi16(x[4], x[5]); |
michael@0 | 1793 | u[1] = _mm_unpackhi_epi16(x[4], x[5]); |
michael@0 | 1794 | u[2] = _mm_unpacklo_epi16(x[6], x[7]); |
michael@0 | 1795 | u[3] = _mm_unpackhi_epi16(x[6], x[7]); |
michael@0 | 1796 | u[4] = _mm_unpacklo_epi16(x[12], x[13]); |
michael@0 | 1797 | u[5] = _mm_unpackhi_epi16(x[12], x[13]); |
michael@0 | 1798 | u[6] = _mm_unpacklo_epi16(x[14], x[15]); |
michael@0 | 1799 | u[7] = _mm_unpackhi_epi16(x[14], x[15]); |
michael@0 | 1800 | |
michael@0 | 1801 | v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); |
michael@0 | 1802 | v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); |
michael@0 | 1803 | v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); |
michael@0 | 1804 | v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); |
michael@0 | 1805 | v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); |
michael@0 | 1806 | v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); |
michael@0 | 1807 | v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); |
michael@0 | 1808 | v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); |
michael@0 | 1809 | v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); |
michael@0 | 1810 | v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); |
michael@0 | 1811 | v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); |
michael@0 | 1812 | v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); |
michael@0 | 1813 | v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); |
michael@0 | 1814 | v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); |
michael@0 | 1815 | v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); |
michael@0 | 1816 | v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); |
michael@0 | 1817 | |
michael@0 | 1818 | u[0] = _mm_add_epi32(v[0], v[4]); |
michael@0 | 1819 | u[1] = _mm_add_epi32(v[1], v[5]); |
michael@0 | 1820 | u[2] = _mm_add_epi32(v[2], v[6]); |
michael@0 | 1821 | u[3] = _mm_add_epi32(v[3], v[7]); |
michael@0 | 1822 | u[4] = _mm_sub_epi32(v[0], v[4]); |
michael@0 | 1823 | u[5] = _mm_sub_epi32(v[1], v[5]); |
michael@0 | 1824 | u[6] = _mm_sub_epi32(v[2], v[6]); |
michael@0 | 1825 | u[7] = _mm_sub_epi32(v[3], v[7]); |
michael@0 | 1826 | u[8] = _mm_add_epi32(v[8], v[12]); |
michael@0 | 1827 | u[9] = _mm_add_epi32(v[9], v[13]); |
michael@0 | 1828 | u[10] = _mm_add_epi32(v[10], v[14]); |
michael@0 | 1829 | u[11] = _mm_add_epi32(v[11], v[15]); |
michael@0 | 1830 | u[12] = _mm_sub_epi32(v[8], v[12]); |
michael@0 | 1831 | u[13] = _mm_sub_epi32(v[9], v[13]); |
michael@0 | 1832 | u[14] = _mm_sub_epi32(v[10], v[14]); |
michael@0 | 1833 | u[15] = _mm_sub_epi32(v[11], v[15]); |
michael@0 | 1834 | |
michael@0 | 1835 | u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1836 | u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1837 | u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1838 | u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1839 | u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1840 | u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1841 | u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1842 | u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1843 | u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); |
michael@0 | 1844 | u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); |
michael@0 | 1845 | u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 1846 | u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 1847 | u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 1848 | u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 1849 | u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 1850 | u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 1851 | |
michael@0 | 1852 | v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 1853 | v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 1854 | v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 1855 | v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 1856 | v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 1857 | v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 1858 | v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 1859 | v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 1860 | v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); |
michael@0 | 1861 | v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); |
michael@0 | 1862 | v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); |
michael@0 | 1863 | v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); |
michael@0 | 1864 | v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); |
michael@0 | 1865 | v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); |
michael@0 | 1866 | v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); |
michael@0 | 1867 | v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); |
michael@0 | 1868 | |
michael@0 | 1869 | s[0] = _mm_add_epi16(x[0], x[2]); |
michael@0 | 1870 | s[1] = _mm_add_epi16(x[1], x[3]); |
michael@0 | 1871 | s[2] = _mm_sub_epi16(x[0], x[2]); |
michael@0 | 1872 | s[3] = _mm_sub_epi16(x[1], x[3]); |
michael@0 | 1873 | s[4] = _mm_packs_epi32(v[0], v[1]); |
michael@0 | 1874 | s[5] = _mm_packs_epi32(v[2], v[3]); |
michael@0 | 1875 | s[6] = _mm_packs_epi32(v[4], v[5]); |
michael@0 | 1876 | s[7] = _mm_packs_epi32(v[6], v[7]); |
michael@0 | 1877 | s[8] = _mm_add_epi16(x[8], x[10]); |
michael@0 | 1878 | s[9] = _mm_add_epi16(x[9], x[11]); |
michael@0 | 1879 | s[10] = _mm_sub_epi16(x[8], x[10]); |
michael@0 | 1880 | s[11] = _mm_sub_epi16(x[9], x[11]); |
michael@0 | 1881 | s[12] = _mm_packs_epi32(v[8], v[9]); |
michael@0 | 1882 | s[13] = _mm_packs_epi32(v[10], v[11]); |
michael@0 | 1883 | s[14] = _mm_packs_epi32(v[12], v[13]); |
michael@0 | 1884 | s[15] = _mm_packs_epi32(v[14], v[15]); |
michael@0 | 1885 | |
michael@0 | 1886 | // stage 4 |
michael@0 | 1887 | u[0] = _mm_unpacklo_epi16(s[2], s[3]); |
michael@0 | 1888 | u[1] = _mm_unpackhi_epi16(s[2], s[3]); |
michael@0 | 1889 | u[2] = _mm_unpacklo_epi16(s[6], s[7]); |
michael@0 | 1890 | u[3] = _mm_unpackhi_epi16(s[6], s[7]); |
michael@0 | 1891 | u[4] = _mm_unpacklo_epi16(s[10], s[11]); |
michael@0 | 1892 | u[5] = _mm_unpackhi_epi16(s[10], s[11]); |
michael@0 | 1893 | u[6] = _mm_unpacklo_epi16(s[14], s[15]); |
michael@0 | 1894 | u[7] = _mm_unpackhi_epi16(s[14], s[15]); |
michael@0 | 1895 | |
michael@0 | 1896 | v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); |
michael@0 | 1897 | v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); |
michael@0 | 1898 | v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); |
michael@0 | 1899 | v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); |
michael@0 | 1900 | v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); |
michael@0 | 1901 | v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); |
michael@0 | 1902 | v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); |
michael@0 | 1903 | v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); |
michael@0 | 1904 | v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); |
michael@0 | 1905 | v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); |
michael@0 | 1906 | v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); |
michael@0 | 1907 | v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); |
michael@0 | 1908 | v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); |
michael@0 | 1909 | v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); |
michael@0 | 1910 | v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); |
michael@0 | 1911 | v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); |
michael@0 | 1912 | |
michael@0 | 1913 | u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1914 | u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1915 | u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1916 | u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1917 | u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1918 | u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1919 | u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1920 | u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1921 | u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); |
michael@0 | 1922 | u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); |
michael@0 | 1923 | u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); |
michael@0 | 1924 | u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); |
michael@0 | 1925 | u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); |
michael@0 | 1926 | u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); |
michael@0 | 1927 | u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); |
michael@0 | 1928 | u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); |
michael@0 | 1929 | |
michael@0 | 1930 | v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 1931 | v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 1932 | v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 1933 | v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 1934 | v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 1935 | v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 1936 | v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 1937 | v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 1938 | v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); |
michael@0 | 1939 | v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); |
michael@0 | 1940 | v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); |
michael@0 | 1941 | v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); |
michael@0 | 1942 | v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); |
michael@0 | 1943 | v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); |
michael@0 | 1944 | v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); |
michael@0 | 1945 | v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); |
michael@0 | 1946 | |
michael@0 | 1947 | in[0] = s[0]; |
michael@0 | 1948 | in[1] = _mm_sub_epi16(kZero, s[8]); |
michael@0 | 1949 | in[2] = s[12]; |
michael@0 | 1950 | in[3] = _mm_sub_epi16(kZero, s[4]); |
michael@0 | 1951 | in[4] = _mm_packs_epi32(v[4], v[5]); |
michael@0 | 1952 | in[5] = _mm_packs_epi32(v[12], v[13]); |
michael@0 | 1953 | in[6] = _mm_packs_epi32(v[8], v[9]); |
michael@0 | 1954 | in[7] = _mm_packs_epi32(v[0], v[1]); |
michael@0 | 1955 | in[8] = _mm_packs_epi32(v[2], v[3]); |
michael@0 | 1956 | in[9] = _mm_packs_epi32(v[10], v[11]); |
michael@0 | 1957 | in[10] = _mm_packs_epi32(v[14], v[15]); |
michael@0 | 1958 | in[11] = _mm_packs_epi32(v[6], v[7]); |
michael@0 | 1959 | in[12] = s[5]; |
michael@0 | 1960 | in[13] = _mm_sub_epi16(kZero, s[13]); |
michael@0 | 1961 | in[14] = s[9]; |
michael@0 | 1962 | in[15] = _mm_sub_epi16(kZero, s[1]); |
michael@0 | 1963 | } |
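
// The tail of iadst16_1d_8col writes the ADST output permutation directly
// into in[]; the kZero subtractions realize the sign flips required on
// outputs 1, 3, 13 and 15.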
michael@0 | 1964 | |
michael@0 | 1965 | static void idct16_1d_8col(__m128i *in) { |
michael@0 | 1966 | const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
michael@0 | 1967 | const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); |
michael@0 | 1968 | const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
michael@0 | 1969 | const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); |
michael@0 | 1970 | const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
michael@0 | 1971 | const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); |
michael@0 | 1972 | const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
michael@0 | 1973 | const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); |
michael@0 | 1974 | const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
michael@0 | 1975 | const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); |
michael@0 | 1976 | const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
michael@0 | 1977 | const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); |
michael@0 | 1978 | const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); |
michael@0 | 1979 | const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
michael@0 | 1980 | const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
michael@0 | 1981 | const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); |
michael@0 | 1982 | const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 1983 | const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); |
michael@0 | 1984 | const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
michael@0 | 1985 | const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
michael@0 | 1986 | const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); |
michael@0 | 1987 | __m128i v[16], u[16], s[16], t[16]; |
michael@0 | 1988 | |
michael@0 | 1989 | // stage 1 |
michael@0 | 1990 | s[0] = in[0]; |
michael@0 | 1991 | s[1] = in[8]; |
michael@0 | 1992 | s[2] = in[4]; |
michael@0 | 1993 | s[3] = in[12]; |
michael@0 | 1994 | s[4] = in[2]; |
michael@0 | 1995 | s[5] = in[10]; |
michael@0 | 1996 | s[6] = in[6]; |
michael@0 | 1997 | s[7] = in[14]; |
michael@0 | 1998 | s[8] = in[1]; |
michael@0 | 1999 | s[9] = in[9]; |
michael@0 | 2000 | s[10] = in[5]; |
michael@0 | 2001 | s[11] = in[13]; |
michael@0 | 2002 | s[12] = in[3]; |
michael@0 | 2003 | s[13] = in[11]; |
michael@0 | 2004 | s[14] = in[7]; |
michael@0 | 2005 | s[15] = in[15]; |
michael@0 | 2006 | |
michael@0 | 2007 | // stage 2 |
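  // Every rotation below uses the same SIMD butterfly: interleave two rows
  // with unpacklo/unpackhi, multiply-add against paired cosine constants
  // with _mm_madd_epi16 (32-bit products), add DCT_CONST_ROUNDING, shift
  // right by DCT_CONST_BITS, and pack back to 16 bits. Scalar sketch of the
  // first pair (cf. dct_const_round_shift() in vp9_idct.h; cospi_k_64 is
  // round(16384 * cos(k*Pi/64)) and DCT_CONST_BITS is 14):
  //   s8  = dct_const_round_shift(x8 * cospi_30_64 - x15 * cospi_2_64);
  //   s15 = dct_const_round_shift(x8 * cospi_2_64  + x15 * cospi_30_64);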
michael@0 | 2008 | u[0] = _mm_unpacklo_epi16(s[8], s[15]); |
michael@0 | 2009 | u[1] = _mm_unpackhi_epi16(s[8], s[15]); |
michael@0 | 2010 | u[2] = _mm_unpacklo_epi16(s[9], s[14]); |
michael@0 | 2011 | u[3] = _mm_unpackhi_epi16(s[9], s[14]); |
michael@0 | 2012 | u[4] = _mm_unpacklo_epi16(s[10], s[13]); |
michael@0 | 2013 | u[5] = _mm_unpackhi_epi16(s[10], s[13]); |
michael@0 | 2014 | u[6] = _mm_unpacklo_epi16(s[11], s[12]); |
michael@0 | 2015 | u[7] = _mm_unpackhi_epi16(s[11], s[12]); |
michael@0 | 2016 | |
michael@0 | 2017 | v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); |
michael@0 | 2018 | v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); |
michael@0 | 2019 | v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); |
michael@0 | 2020 | v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); |
michael@0 | 2021 | v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); |
michael@0 | 2022 | v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); |
michael@0 | 2023 | v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); |
michael@0 | 2024 | v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); |
michael@0 | 2025 | v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); |
michael@0 | 2026 | v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); |
michael@0 | 2027 | v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); |
michael@0 | 2028 | v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); |
michael@0 | 2029 | v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); |
michael@0 | 2030 | v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); |
michael@0 | 2031 | v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); |
michael@0 | 2032 | v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); |
michael@0 | 2033 | |
michael@0 | 2034 | u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
michael@0 | 2035 | u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
michael@0 | 2036 | u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
michael@0 | 2037 | u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
michael@0 | 2038 | u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); |
michael@0 | 2039 | u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); |
michael@0 | 2040 | u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); |
michael@0 | 2041 | u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); |
michael@0 | 2042 | u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); |
michael@0 | 2043 | u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); |
michael@0 | 2044 | u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); |
michael@0 | 2045 | u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); |
michael@0 | 2046 | u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); |
michael@0 | 2047 | u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); |
michael@0 | 2048 | u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); |
michael@0 | 2049 | u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); |
michael@0 | 2050 | |
michael@0 | 2051 | u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 2052 | u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 2053 | u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 2054 | u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 2055 | u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 2056 | u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 2057 | u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 2058 | u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 2059 | u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); |
michael@0 | 2060 | u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); |
michael@0 | 2061 | u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); |
michael@0 | 2062 | u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); |
michael@0 | 2063 | u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); |
michael@0 | 2064 | u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); |
michael@0 | 2065 | u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); |
michael@0 | 2066 | u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); |
michael@0 | 2067 | |
michael@0 | 2068 | s[8] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2069 | s[15] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2070 | s[9] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2071 | s[14] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2072 | s[10] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 2073 | s[13] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 2074 | s[11] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 2075 | s[12] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 2076 | |
michael@0 | 2077 | // stage 3 |
michael@0 | 2078 | t[0] = s[0]; |
michael@0 | 2079 | t[1] = s[1]; |
michael@0 | 2080 | t[2] = s[2]; |
michael@0 | 2081 | t[3] = s[3]; |
michael@0 | 2082 | u[0] = _mm_unpacklo_epi16(s[4], s[7]); |
michael@0 | 2083 | u[1] = _mm_unpackhi_epi16(s[4], s[7]); |
michael@0 | 2084 | u[2] = _mm_unpacklo_epi16(s[5], s[6]); |
michael@0 | 2085 | u[3] = _mm_unpackhi_epi16(s[5], s[6]); |
michael@0 | 2086 | |
michael@0 | 2087 | v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); |
michael@0 | 2088 | v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); |
michael@0 | 2089 | v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); |
michael@0 | 2090 | v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); |
michael@0 | 2091 | v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); |
michael@0 | 2092 | v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); |
michael@0 | 2093 | v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); |
michael@0 | 2094 | v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); |
michael@0 | 2095 | |
michael@0 | 2096 | u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
michael@0 | 2097 | u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
michael@0 | 2098 | u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
michael@0 | 2099 | u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
michael@0 | 2100 | u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); |
michael@0 | 2101 | u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); |
michael@0 | 2102 | u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); |
michael@0 | 2103 | u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); |
michael@0 | 2104 | |
michael@0 | 2105 | u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 2106 | u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 2107 | u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 2108 | u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 2109 | u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 2110 | u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 2111 | u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 2112 | u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 2113 | |
michael@0 | 2114 | t[4] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2115 | t[7] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2116 | t[5] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2117 | t[6] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2118 | t[8] = _mm_add_epi16(s[8], s[9]); |
michael@0 | 2119 | t[9] = _mm_sub_epi16(s[8], s[9]); |
michael@0 | 2120 | t[10] = _mm_sub_epi16(s[11], s[10]); |
michael@0 | 2121 | t[11] = _mm_add_epi16(s[10], s[11]); |
michael@0 | 2122 | t[12] = _mm_add_epi16(s[12], s[13]); |
michael@0 | 2123 | t[13] = _mm_sub_epi16(s[12], s[13]); |
michael@0 | 2124 | t[14] = _mm_sub_epi16(s[15], s[14]); |
michael@0 | 2125 | t[15] = _mm_add_epi16(s[14], s[15]); |
michael@0 | 2126 | |
michael@0 | 2127 | // stage 4 |
michael@0 | 2128 | u[0] = _mm_unpacklo_epi16(t[0], t[1]); |
michael@0 | 2129 | u[1] = _mm_unpackhi_epi16(t[0], t[1]); |
michael@0 | 2130 | u[2] = _mm_unpacklo_epi16(t[2], t[3]); |
michael@0 | 2131 | u[3] = _mm_unpackhi_epi16(t[2], t[3]); |
michael@0 | 2132 | u[4] = _mm_unpacklo_epi16(t[9], t[14]); |
michael@0 | 2133 | u[5] = _mm_unpackhi_epi16(t[9], t[14]); |
michael@0 | 2134 | u[6] = _mm_unpacklo_epi16(t[10], t[13]); |
michael@0 | 2135 | u[7] = _mm_unpackhi_epi16(t[10], t[13]); |
michael@0 | 2136 | |
michael@0 | 2137 | v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); |
michael@0 | 2138 | v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); |
michael@0 | 2139 | v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); |
michael@0 | 2140 | v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); |
michael@0 | 2141 | v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); |
michael@0 | 2142 | v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); |
michael@0 | 2143 | v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); |
michael@0 | 2144 | v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); |
michael@0 | 2145 | v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); |
michael@0 | 2146 | v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); |
michael@0 | 2147 | v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); |
michael@0 | 2148 | v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); |
michael@0 | 2149 | v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); |
michael@0 | 2150 | v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); |
michael@0 | 2151 | v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); |
michael@0 | 2152 | v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); |
michael@0 | 2153 | |
michael@0 | 2154 | u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
michael@0 | 2155 | u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
michael@0 | 2156 | u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
michael@0 | 2157 | u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
michael@0 | 2158 | u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); |
michael@0 | 2159 | u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); |
michael@0 | 2160 | u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); |
michael@0 | 2161 | u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); |
michael@0 | 2162 | u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); |
michael@0 | 2163 | u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); |
michael@0 | 2164 | u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); |
michael@0 | 2165 | u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); |
michael@0 | 2166 | u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); |
michael@0 | 2167 | u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); |
michael@0 | 2168 | u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); |
michael@0 | 2169 | u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); |
michael@0 | 2170 | |
michael@0 | 2171 | u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 2172 | u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 2173 | u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 2174 | u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 2175 | u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 2176 | u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 2177 | u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 2178 | u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 2179 | u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); |
michael@0 | 2180 | u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); |
michael@0 | 2181 | u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); |
michael@0 | 2182 | u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); |
michael@0 | 2183 | u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); |
michael@0 | 2184 | u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); |
michael@0 | 2185 | u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); |
michael@0 | 2186 | u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); |
michael@0 | 2187 | |
michael@0 | 2188 | s[0] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2189 | s[1] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2190 | s[2] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2191 | s[3] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2192 | s[4] = _mm_add_epi16(t[4], t[5]); |
michael@0 | 2193 | s[5] = _mm_sub_epi16(t[4], t[5]); |
michael@0 | 2194 | s[6] = _mm_sub_epi16(t[7], t[6]); |
michael@0 | 2195 | s[7] = _mm_add_epi16(t[6], t[7]); |
michael@0 | 2196 | s[8] = t[8]; |
michael@0 | 2197 | s[15] = t[15]; |
michael@0 | 2198 | s[9] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 2199 | s[14] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 2200 | s[10] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 2201 | s[13] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 2202 | s[11] = t[11]; |
michael@0 | 2203 | s[12] = t[12]; |
michael@0 | 2204 | |
michael@0 | 2205 | // stage 5 |
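  // s[5] and s[6] are recombined with the +/-cospi_16_64 rotation
  // (cospi_16_64 ~ 16384 / sqrt(2)); the remaining even-part terms need
  // only add/sub butterflies.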
michael@0 | 2206 | t[0] = _mm_add_epi16(s[0], s[3]); |
michael@0 | 2207 | t[1] = _mm_add_epi16(s[1], s[2]); |
michael@0 | 2208 | t[2] = _mm_sub_epi16(s[1], s[2]); |
michael@0 | 2209 | t[3] = _mm_sub_epi16(s[0], s[3]); |
michael@0 | 2210 | t[4] = s[4]; |
michael@0 | 2211 | t[7] = s[7]; |
michael@0 | 2212 | |
michael@0 | 2213 | u[0] = _mm_unpacklo_epi16(s[5], s[6]); |
michael@0 | 2214 | u[1] = _mm_unpackhi_epi16(s[5], s[6]); |
michael@0 | 2215 | v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); |
michael@0 | 2216 | v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); |
michael@0 | 2217 | v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); |
michael@0 | 2218 | v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); |
michael@0 | 2219 | u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
michael@0 | 2220 | u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
michael@0 | 2221 | u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
michael@0 | 2222 | u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
michael@0 | 2223 | u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 2224 | u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 2225 | u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 2226 | u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 2227 | t[5] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2228 | t[6] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2229 | |
michael@0 | 2230 | t[8] = _mm_add_epi16(s[8], s[11]); |
michael@0 | 2231 | t[9] = _mm_add_epi16(s[9], s[10]); |
michael@0 | 2232 | t[10] = _mm_sub_epi16(s[9], s[10]); |
michael@0 | 2233 | t[11] = _mm_sub_epi16(s[8], s[11]); |
michael@0 | 2234 | t[12] = _mm_sub_epi16(s[15], s[12]); |
michael@0 | 2235 | t[13] = _mm_sub_epi16(s[14], s[13]); |
michael@0 | 2236 | t[14] = _mm_add_epi16(s[13], s[14]); |
michael@0 | 2237 | t[15] = _mm_add_epi16(s[12], s[15]); |
michael@0 | 2238 | |
michael@0 | 2239 | // stage 6 |
michael@0 | 2240 | s[0] = _mm_add_epi16(t[0], t[7]); |
michael@0 | 2241 | s[1] = _mm_add_epi16(t[1], t[6]); |
michael@0 | 2242 | s[2] = _mm_add_epi16(t[2], t[5]); |
michael@0 | 2243 | s[3] = _mm_add_epi16(t[3], t[4]); |
michael@0 | 2244 | s[4] = _mm_sub_epi16(t[3], t[4]); |
michael@0 | 2245 | s[5] = _mm_sub_epi16(t[2], t[5]); |
michael@0 | 2246 | s[6] = _mm_sub_epi16(t[1], t[6]); |
michael@0 | 2247 | s[7] = _mm_sub_epi16(t[0], t[7]); |
michael@0 | 2248 | s[8] = t[8]; |
michael@0 | 2249 | s[9] = t[9]; |
michael@0 | 2250 | |
michael@0 | 2251 | u[0] = _mm_unpacklo_epi16(t[10], t[13]); |
michael@0 | 2252 | u[1] = _mm_unpackhi_epi16(t[10], t[13]); |
michael@0 | 2253 | u[2] = _mm_unpacklo_epi16(t[11], t[12]); |
michael@0 | 2254 | u[3] = _mm_unpackhi_epi16(t[11], t[12]); |
michael@0 | 2255 | |
michael@0 | 2256 | v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); |
michael@0 | 2257 | v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); |
michael@0 | 2258 | v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); |
michael@0 | 2259 | v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); |
michael@0 | 2260 | v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); |
michael@0 | 2261 | v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); |
michael@0 | 2262 | v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); |
michael@0 | 2263 | v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); |
michael@0 | 2264 | |
michael@0 | 2265 | u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); |
michael@0 | 2266 | u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); |
michael@0 | 2267 | u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); |
michael@0 | 2268 | u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); |
michael@0 | 2269 | u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); |
michael@0 | 2270 | u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); |
michael@0 | 2271 | u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); |
michael@0 | 2272 | u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); |
michael@0 | 2273 | |
michael@0 | 2274 | u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 2275 | u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 2276 | u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 2277 | u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 2278 | u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 2279 | u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 2280 | u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 2281 | u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 2282 | |
michael@0 | 2283 | s[10] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2284 | s[13] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2285 | s[11] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2286 | s[12] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2287 | s[14] = t[14]; |
michael@0 | 2288 | s[15] = t[15]; |
michael@0 | 2289 | |
michael@0 | 2290 | // stage 7 |
michael@0 | 2291 | in[0] = _mm_add_epi16(s[0], s[15]); |
michael@0 | 2292 | in[1] = _mm_add_epi16(s[1], s[14]); |
michael@0 | 2293 | in[2] = _mm_add_epi16(s[2], s[13]); |
michael@0 | 2294 | in[3] = _mm_add_epi16(s[3], s[12]); |
michael@0 | 2295 | in[4] = _mm_add_epi16(s[4], s[11]); |
michael@0 | 2296 | in[5] = _mm_add_epi16(s[5], s[10]); |
michael@0 | 2297 | in[6] = _mm_add_epi16(s[6], s[9]); |
michael@0 | 2298 | in[7] = _mm_add_epi16(s[7], s[8]); |
michael@0 | 2299 | in[8] = _mm_sub_epi16(s[7], s[8]); |
michael@0 | 2300 | in[9] = _mm_sub_epi16(s[6], s[9]); |
michael@0 | 2301 | in[10] = _mm_sub_epi16(s[5], s[10]); |
michael@0 | 2302 | in[11] = _mm_sub_epi16(s[4], s[11]); |
michael@0 | 2303 | in[12] = _mm_sub_epi16(s[3], s[12]); |
michael@0 | 2304 | in[13] = _mm_sub_epi16(s[2], s[13]); |
michael@0 | 2305 | in[14] = _mm_sub_epi16(s[1], s[14]); |
michael@0 | 2306 | in[15] = _mm_sub_epi16(s[0], s[15]); |
michael@0 | 2307 | } |
michael@0 | 2308 | |
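// Row and column passes share one implementation: transpose the 16x16
// block (held as two 8x16 halves) and run the 1-D transform down each half.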
michael@0 | 2309 | static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { |
michael@0 | 2310 | array_transpose_16x16(in0, in1); |
michael@0 | 2311 | idct16_1d_8col(in0); |
michael@0 | 2312 | idct16_1d_8col(in1); |
michael@0 | 2313 | } |
michael@0 | 2314 | |
michael@0 | 2315 | static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { |
michael@0 | 2316 | array_transpose_16x16(in0, in1); |
michael@0 | 2317 | iadst16_1d_8col(in0); |
michael@0 | 2318 | iadst16_1d_8col(in1); |
michael@0 | 2319 | } |
michael@0 | 2320 | |
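// Loads one 8x16 half of a 16x16 coefficient block (row stride 16), one
// register per row; the caller selects the right half by advancing input
// by 8.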
michael@0 | 2321 | static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { |
michael@0 | 2322 | in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16)); |
michael@0 | 2323 | in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); |
michael@0 | 2324 | in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); |
michael@0 | 2325 | in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); |
michael@0 | 2326 | in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); |
michael@0 | 2327 | in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); |
michael@0 | 2328 | in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); |
michael@0 | 2329 | in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); |
michael@0 | 2330 | |
michael@0 | 2331 | in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); |
michael@0 | 2332 | in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); |
michael@0 | 2333 | in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); |
michael@0 | 2334 | in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); |
michael@0 | 2335 | in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); |
michael@0 | 2336 | in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); |
michael@0 | 2337 | in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); |
michael@0 | 2338 | in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); |
michael@0 | 2339 | } |
michael@0 | 2340 | |
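// Final reconstruction for an 8x16 half-block: round with 1 << 5 and shift
// right by 6 to undo the transform scaling, then add the residual to the
// prediction in dest via RECON_AND_STORE.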
michael@0 | 2341 | static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { |
michael@0 | 2342 | const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
michael@0 | 2343 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 2344 | // Final rounding and shift |
michael@0 | 2345 | in[0] = _mm_adds_epi16(in[0], final_rounding); |
michael@0 | 2346 | in[1] = _mm_adds_epi16(in[1], final_rounding); |
michael@0 | 2347 | in[2] = _mm_adds_epi16(in[2], final_rounding); |
michael@0 | 2348 | in[3] = _mm_adds_epi16(in[3], final_rounding); |
michael@0 | 2349 | in[4] = _mm_adds_epi16(in[4], final_rounding); |
michael@0 | 2350 | in[5] = _mm_adds_epi16(in[5], final_rounding); |
michael@0 | 2351 | in[6] = _mm_adds_epi16(in[6], final_rounding); |
michael@0 | 2352 | in[7] = _mm_adds_epi16(in[7], final_rounding); |
michael@0 | 2353 | in[8] = _mm_adds_epi16(in[8], final_rounding); |
michael@0 | 2354 | in[9] = _mm_adds_epi16(in[9], final_rounding); |
michael@0 | 2355 | in[10] = _mm_adds_epi16(in[10], final_rounding); |
michael@0 | 2356 | in[11] = _mm_adds_epi16(in[11], final_rounding); |
michael@0 | 2357 | in[12] = _mm_adds_epi16(in[12], final_rounding); |
michael@0 | 2358 | in[13] = _mm_adds_epi16(in[13], final_rounding); |
michael@0 | 2359 | in[14] = _mm_adds_epi16(in[14], final_rounding); |
michael@0 | 2360 | in[15] = _mm_adds_epi16(in[15], final_rounding); |
michael@0 | 2361 | |
michael@0 | 2362 | in[0] = _mm_srai_epi16(in[0], 6); |
michael@0 | 2363 | in[1] = _mm_srai_epi16(in[1], 6); |
michael@0 | 2364 | in[2] = _mm_srai_epi16(in[2], 6); |
michael@0 | 2365 | in[3] = _mm_srai_epi16(in[3], 6); |
michael@0 | 2366 | in[4] = _mm_srai_epi16(in[4], 6); |
michael@0 | 2367 | in[5] = _mm_srai_epi16(in[5], 6); |
michael@0 | 2368 | in[6] = _mm_srai_epi16(in[6], 6); |
michael@0 | 2369 | in[7] = _mm_srai_epi16(in[7], 6); |
michael@0 | 2370 | in[8] = _mm_srai_epi16(in[8], 6); |
michael@0 | 2371 | in[9] = _mm_srai_epi16(in[9], 6); |
michael@0 | 2372 | in[10] = _mm_srai_epi16(in[10], 6); |
michael@0 | 2373 | in[11] = _mm_srai_epi16(in[11], 6); |
michael@0 | 2374 | in[12] = _mm_srai_epi16(in[12], 6); |
michael@0 | 2375 | in[13] = _mm_srai_epi16(in[13], 6); |
michael@0 | 2376 | in[14] = _mm_srai_epi16(in[14], 6); |
michael@0 | 2377 | in[15] = _mm_srai_epi16(in[15], 6); |
michael@0 | 2378 | |
michael@0 | 2379 | RECON_AND_STORE(dest, in[0]); |
michael@0 | 2380 | RECON_AND_STORE(dest, in[1]); |
michael@0 | 2381 | RECON_AND_STORE(dest, in[2]); |
michael@0 | 2382 | RECON_AND_STORE(dest, in[3]); |
michael@0 | 2383 | RECON_AND_STORE(dest, in[4]); |
michael@0 | 2384 | RECON_AND_STORE(dest, in[5]); |
michael@0 | 2385 | RECON_AND_STORE(dest, in[6]); |
michael@0 | 2386 | RECON_AND_STORE(dest, in[7]); |
michael@0 | 2387 | RECON_AND_STORE(dest, in[8]); |
michael@0 | 2388 | RECON_AND_STORE(dest, in[9]); |
michael@0 | 2389 | RECON_AND_STORE(dest, in[10]); |
michael@0 | 2390 | RECON_AND_STORE(dest, in[11]); |
michael@0 | 2391 | RECON_AND_STORE(dest, in[12]); |
michael@0 | 2392 | RECON_AND_STORE(dest, in[13]); |
michael@0 | 2393 | RECON_AND_STORE(dest, in[14]); |
michael@0 | 2394 | RECON_AND_STORE(dest, in[15]); |
michael@0 | 2395 | } |
michael@0 | 2396 | |
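// 16x16 hybrid inverse transform: tx_type selects DCT or ADST for each of
// the two 1-D passes, after which both 8x16 halves are reconstructed into
// dest.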
michael@0 | 2397 | void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, |
michael@0 | 2398 | int tx_type) { |
michael@0 | 2399 | __m128i in0[16], in1[16]; |
michael@0 | 2400 | |
michael@0 | 2401 | load_buffer_8x16(input, in0); |
michael@0 | 2402 | input += 8; |
michael@0 | 2403 | load_buffer_8x16(input, in1); |
michael@0 | 2404 | |
michael@0 | 2405 | switch (tx_type) { |
michael@0 | 2406 | case 0: // DCT_DCT |
michael@0 | 2407 | idct16_1d_sse2(in0, in1); |
michael@0 | 2408 | idct16_1d_sse2(in0, in1); |
michael@0 | 2409 | break; |
michael@0 | 2410 | case 1: // ADST_DCT |
michael@0 | 2411 | idct16_1d_sse2(in0, in1); |
michael@0 | 2412 | iadst16_1d_sse2(in0, in1); |
michael@0 | 2413 | break; |
michael@0 | 2414 | case 2: // DCT_ADST |
michael@0 | 2415 | iadst16_1d_sse2(in0, in1); |
michael@0 | 2416 | idct16_1d_sse2(in0, in1); |
michael@0 | 2417 | break; |
michael@0 | 2418 | case 3: // ADST_ADST |
michael@0 | 2419 | iadst16_1d_sse2(in0, in1); |
michael@0 | 2420 | iadst16_1d_sse2(in0, in1); |
michael@0 | 2421 | break; |
michael@0 | 2422 | default: |
michael@0 | 2423 | assert(0); |
michael@0 | 2424 | break; |
michael@0 | 2425 | } |
michael@0 | 2426 | |
michael@0 | 2427 | write_buffer_8x16(dest, in0, stride); |
michael@0 | 2428 | dest += 8; |
michael@0 | 2429 | write_buffer_8x16(dest, in1, stride); |
michael@0 | 2430 | } |
michael@0 | 2431 | |
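// Shortcut for blocks with at most 10 nonzero coefficients: in zigzag
// order these all fall inside the top-left 4x4, so the first pass only
// needs to transform four input rows.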
michael@0 | 2432 | void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, |
michael@0 | 2433 | int stride) { |
michael@0 | 2434 | const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
michael@0 | 2435 | const __m128i final_rounding = _mm_set1_epi16(1 << 5); |
michael@0 | 2436 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 2437 | |
michael@0 | 2438 | const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
michael@0 | 2439 | const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
michael@0 | 2440 | const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
michael@0 | 2441 | const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
michael@0 | 2442 | const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
michael@0 | 2443 | const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
michael@0 | 2444 | const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
michael@0 | 2445 | const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
michael@0 | 2446 | |
michael@0 | 2447 | const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
michael@0 | 2448 | const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
michael@0 | 2449 | const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
michael@0 | 2450 | const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
michael@0 | 2451 | |
michael@0 | 2452 | const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
michael@0 | 2453 | const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
michael@0 | 2454 | const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
michael@0 | 2455 | const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
michael@0 | 2456 | const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 2457 | const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
michael@0 | 2458 | const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
michael@0 | 2459 | const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 2460 | |
michael@0 | 2461 | const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
michael@0 | 2462 | |
michael@0 | 2463 | __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, |
michael@0 | 2464 | in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, |
michael@0 | 2465 | in10 = zero, in11 = zero, in12 = zero, in13 = zero, |
michael@0 | 2466 | in14 = zero, in15 = zero; |
michael@0 | 2467 | __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, |
michael@0 | 2468 | l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, |
michael@0 | 2469 | l12 = zero, l13 = zero, l14 = zero, l15 = zero; |
michael@0 | 2470 | |
michael@0 | 2471 | __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
michael@0 | 2472 | stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
michael@0 | 2473 | stp1_8_0, stp1_12_0; |
michael@0 | 2474 | __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
michael@0 | 2475 | stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
michael@0 | 2476 | __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
michael@0 | 2477 | int i; |
michael@0 | 2478 | // First 1-D idct pass. Load input data (only rows 0-3 can be nonzero). |
michael@0 | 2479 | in0 = _mm_load_si128((const __m128i *)input); |
michael@0 | 2480 | in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); |
michael@0 | 2481 | in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); |
michael@0 | 2482 | in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); |
michael@0 | 2483 | in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); |
michael@0 | 2484 | in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); |
michael@0 | 2485 | in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); |
michael@0 | 2486 | in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); |
michael@0 | 2487 | |
michael@0 | 2488 | TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); |
michael@0 | 2489 | TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); |
michael@0 | 2490 | |
michael@0 | 2491 | // Stage2 |
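  // Note: TRANSPOSE_8X4 leaves two transposed rows per register, so this
  // stage gathers its inputs from the high 64-bit halves (unpackhi) rather
  // than unpacklo.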
michael@0 | 2492 | { |
michael@0 | 2493 | const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); |
michael@0 | 2494 | const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); |
michael@0 | 2495 | const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); |
michael@0 | 2496 | const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); |
michael@0 | 2497 | |
michael@0 | 2498 | tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); |
michael@0 | 2499 | tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); |
michael@0 | 2500 | tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); |
michael@0 | 2501 | tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); |
michael@0 | 2502 | tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); |
michael@0 | 2503 | tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); |
michael@0 | 2504 | tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); |
michael@0 | 2505 | tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); |
michael@0 | 2506 | |
michael@0 | 2507 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 2508 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 2509 | tmp4 = _mm_add_epi32(tmp4, rounding); |
michael@0 | 2510 | tmp6 = _mm_add_epi32(tmp6, rounding); |
michael@0 | 2511 | tmp1 = _mm_add_epi32(tmp1, rounding); |
michael@0 | 2512 | tmp3 = _mm_add_epi32(tmp3, rounding); |
michael@0 | 2513 | tmp5 = _mm_add_epi32(tmp5, rounding); |
michael@0 | 2514 | tmp7 = _mm_add_epi32(tmp7, rounding); |
michael@0 | 2515 | |
michael@0 | 2516 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 2517 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 2518 | tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
michael@0 | 2519 | tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
michael@0 | 2520 | tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); |
michael@0 | 2521 | tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); |
michael@0 | 2522 | tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); |
michael@0 | 2523 | tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); |
michael@0 | 2524 | |
michael@0 | 2525 | stp2_8 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 2526 | stp2_15 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 2527 | stp2_9 = _mm_packs_epi32(tmp4, zero); |
michael@0 | 2528 | stp2_14 = _mm_packs_epi32(tmp6, zero); |
michael@0 | 2529 | |
michael@0 | 2530 | stp2_10 = _mm_packs_epi32(tmp1, zero); |
michael@0 | 2531 | stp2_13 = _mm_packs_epi32(tmp3, zero); |
michael@0 | 2532 | stp2_11 = _mm_packs_epi32(tmp5, zero); |
michael@0 | 2533 | stp2_12 = _mm_packs_epi32(tmp7, zero); |
michael@0 | 2534 | } |
michael@0 | 2535 | |
michael@0 | 2536 | // Stage3 |
michael@0 | 2537 | { |
michael@0 | 2538 | const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); |
michael@0 | 2539 | const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); |
michael@0 | 2540 | |
michael@0 | 2541 | tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); |
michael@0 | 2542 | tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); |
michael@0 | 2543 | tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); |
michael@0 | 2544 | tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); |
michael@0 | 2545 | |
michael@0 | 2546 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 2547 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 2548 | tmp4 = _mm_add_epi32(tmp4, rounding); |
michael@0 | 2549 | tmp6 = _mm_add_epi32(tmp6, rounding); |
michael@0 | 2550 | |
michael@0 | 2551 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 2552 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 2553 | tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
michael@0 | 2554 | tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
michael@0 | 2555 | |
michael@0 | 2556 | stp1_4 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 2557 | stp1_7 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 2558 | stp1_5 = _mm_packs_epi32(tmp4, zero); |
michael@0 | 2559 | stp1_6 = _mm_packs_epi32(tmp6, zero); |
michael@0 | 2560 | |
michael@0 | 2561 | stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); |
michael@0 | 2562 | stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); |
michael@0 | 2563 | stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); |
michael@0 | 2564 | stp1_11 = _mm_add_epi16(stp2_11, stp2_10); |
michael@0 | 2565 | |
michael@0 | 2566 | stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); |
michael@0 | 2567 | stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); |
michael@0 | 2568 | stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); |
michael@0 | 2569 | stp1_15 = _mm_add_epi16(stp2_15, stp2_14); |
michael@0 | 2570 | } |
michael@0 | 2571 | |
michael@0 | 2572 | // Stage4 |
michael@0 | 2573 | { |
michael@0 | 2574 | const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); |
michael@0 | 2575 | const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); |
michael@0 | 2576 | const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); |
michael@0 | 2577 | const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); |
michael@0 | 2578 | |
michael@0 | 2579 | tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); |
michael@0 | 2580 | tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); |
michael@0 | 2581 | tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); |
michael@0 | 2582 | tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); |
michael@0 | 2583 | tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); |
michael@0 | 2584 | tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); |
michael@0 | 2585 | tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); |
michael@0 | 2586 | tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); |
michael@0 | 2587 | |
michael@0 | 2588 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 2589 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 2590 | tmp4 = _mm_add_epi32(tmp4, rounding); |
michael@0 | 2591 | tmp6 = _mm_add_epi32(tmp6, rounding); |
michael@0 | 2592 | tmp1 = _mm_add_epi32(tmp1, rounding); |
michael@0 | 2593 | tmp3 = _mm_add_epi32(tmp3, rounding); |
michael@0 | 2594 | tmp5 = _mm_add_epi32(tmp5, rounding); |
michael@0 | 2595 | tmp7 = _mm_add_epi32(tmp7, rounding); |
michael@0 | 2596 | |
michael@0 | 2597 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 2598 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 2599 | tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
michael@0 | 2600 | tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
michael@0 | 2601 | tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); |
michael@0 | 2602 | tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); |
michael@0 | 2603 | tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); |
michael@0 | 2604 | tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); |
michael@0 | 2605 | |
michael@0 | 2606 | stp2_0 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 2607 | stp2_1 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 2608 | stp2_2 = _mm_packs_epi32(tmp4, zero); |
michael@0 | 2609 | stp2_3 = _mm_packs_epi32(tmp6, zero); |
michael@0 | 2610 | stp2_9 = _mm_packs_epi32(tmp1, zero); |
michael@0 | 2611 | stp2_14 = _mm_packs_epi32(tmp3, zero); |
michael@0 | 2612 | stp2_10 = _mm_packs_epi32(tmp5, zero); |
michael@0 | 2613 | stp2_13 = _mm_packs_epi32(tmp7, zero); |
michael@0 | 2614 | |
michael@0 | 2615 | stp2_4 = _mm_add_epi16(stp1_4, stp1_5); |
michael@0 | 2616 | stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); |
michael@0 | 2617 | stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); |
michael@0 | 2618 | stp2_7 = _mm_add_epi16(stp1_7, stp1_6); |
michael@0 | 2619 | } |
michael@0 | 2620 | |
michael@0 | 2621 | // Stage5 (the stp2_5/stp2_6 rotation is folded into the Stage6 block below) |
michael@0 | 2622 | { |
michael@0 | 2623 | stp1_0 = _mm_add_epi16(stp2_0, stp2_3); |
michael@0 | 2624 | stp1_1 = _mm_add_epi16(stp2_1, stp2_2); |
michael@0 | 2625 | stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); |
michael@0 | 2626 | stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); |
michael@0 | 2627 | |
michael@0 | 2628 | stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); |
michael@0 | 2629 | stp1_9 = _mm_add_epi16(stp2_9, stp2_10); |
michael@0 | 2630 | stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); |
michael@0 | 2631 | stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); |
michael@0 | 2632 | |
michael@0 | 2633 | stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); |
michael@0 | 2634 | stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); |
michael@0 | 2635 | stp1_14 = _mm_add_epi16(stp2_14, stp2_13); |
michael@0 | 2636 | stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); |
michael@0 | 2637 | } |
michael@0 | 2638 | |
michael@0 | 2639 | // Stage6 |
michael@0 | 2640 | { |
michael@0 | 2641 | const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); |
michael@0 | 2642 | const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); |
michael@0 | 2643 | const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); |
michael@0 | 2644 | |
michael@0 | 2645 | tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); |
michael@0 | 2646 | tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); |
michael@0 | 2647 | tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); |
michael@0 | 2648 | tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); |
michael@0 | 2649 | tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); |
michael@0 | 2650 | tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); |
michael@0 | 2651 | |
michael@0 | 2652 | tmp1 = _mm_add_epi32(tmp1, rounding); |
michael@0 | 2653 | tmp3 = _mm_add_epi32(tmp3, rounding); |
michael@0 | 2654 | tmp0 = _mm_add_epi32(tmp0, rounding); |
michael@0 | 2655 | tmp2 = _mm_add_epi32(tmp2, rounding); |
michael@0 | 2656 | tmp4 = _mm_add_epi32(tmp4, rounding); |
michael@0 | 2657 | tmp6 = _mm_add_epi32(tmp6, rounding); |
michael@0 | 2658 | |
michael@0 | 2659 | tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); |
michael@0 | 2660 | tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); |
michael@0 | 2661 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
michael@0 | 2662 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
michael@0 | 2663 | tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
michael@0 | 2664 | tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
michael@0 | 2665 | |
michael@0 | 2666 | stp1_5 = _mm_packs_epi32(tmp1, zero); |
michael@0 | 2667 | stp1_6 = _mm_packs_epi32(tmp3, zero); |
michael@0 | 2668 | stp2_10 = _mm_packs_epi32(tmp0, zero); |
michael@0 | 2669 | stp2_13 = _mm_packs_epi32(tmp2, zero); |
michael@0 | 2670 | stp2_11 = _mm_packs_epi32(tmp4, zero); |
michael@0 | 2671 | stp2_12 = _mm_packs_epi32(tmp6, zero); |
michael@0 | 2672 | |
michael@0 | 2673 | stp2_0 = _mm_add_epi16(stp1_0, stp2_7); |
michael@0 | 2674 | stp2_1 = _mm_add_epi16(stp1_1, stp1_6); |
michael@0 | 2675 | stp2_2 = _mm_add_epi16(stp1_2, stp1_5); |
michael@0 | 2676 | stp2_3 = _mm_add_epi16(stp1_3, stp2_4); |
michael@0 | 2677 | stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); |
michael@0 | 2678 | stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); |
michael@0 | 2679 | stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); |
michael@0 | 2680 | stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); |
michael@0 | 2681 | } |
michael@0 | 2682 | |
michael@0 | 2683 | // Stage7. Left 8x16 only. |
michael@0 | 2684 | l0 = _mm_add_epi16(stp2_0, stp1_15); |
michael@0 | 2685 | l1 = _mm_add_epi16(stp2_1, stp1_14); |
michael@0 | 2686 | l2 = _mm_add_epi16(stp2_2, stp2_13); |
michael@0 | 2687 | l3 = _mm_add_epi16(stp2_3, stp2_12); |
michael@0 | 2688 | l4 = _mm_add_epi16(stp2_4, stp2_11); |
michael@0 | 2689 | l5 = _mm_add_epi16(stp2_5, stp2_10); |
michael@0 | 2690 | l6 = _mm_add_epi16(stp2_6, stp1_9); |
michael@0 | 2691 | l7 = _mm_add_epi16(stp2_7, stp1_8); |
michael@0 | 2692 | l8 = _mm_sub_epi16(stp2_7, stp1_8); |
michael@0 | 2693 | l9 = _mm_sub_epi16(stp2_6, stp1_9); |
michael@0 | 2694 | l10 = _mm_sub_epi16(stp2_5, stp2_10); |
michael@0 | 2695 | l11 = _mm_sub_epi16(stp2_4, stp2_11); |
michael@0 | 2696 | l12 = _mm_sub_epi16(stp2_3, stp2_12); |
michael@0 | 2697 | l13 = _mm_sub_epi16(stp2_2, stp2_13); |
michael@0 | 2698 | l14 = _mm_sub_epi16(stp2_1, stp1_14); |
michael@0 | 2699 | l15 = _mm_sub_epi16(stp2_0, stp1_15); |
michael@0 | 2700 | |
michael@0 | 2701 | // Second 1-D idct pass, completing the 2-D transform on two 8x16 blocks. |
michael@0 | 2702 | for (i = 0; i < 2; i++) { |
michael@0 | 2703 | if (i == 0) |
michael@0 | 2704 | TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, |
michael@0 | 2705 | in5, in6, in7); |
michael@0 | 2706 | |
michael@0 | 2707 | if (i == 1) |
michael@0 | 2708 | TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, |
michael@0 | 2709 | in4, in5, in6, in7); |
michael@0 | 2710 | |
michael@0 | 2711 | in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; |
michael@0 | 2712 | |
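    // IDCT16_1D (a macro defined earlier in this file) performs the full
    // 16-point column transform on in0..in15, producing the stp1_*/stp2_*
    // values consumed by stage 7 below.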
michael@0 | 2713 | IDCT16_1D |
michael@0 | 2714 | |
michael@0 | 2715 | // Stage7 |
michael@0 | 2716 | in0 = _mm_add_epi16(stp2_0, stp1_15); |
michael@0 | 2717 | in1 = _mm_add_epi16(stp2_1, stp1_14); |
michael@0 | 2718 | in2 = _mm_add_epi16(stp2_2, stp2_13); |
michael@0 | 2719 | in3 = _mm_add_epi16(stp2_3, stp2_12); |
michael@0 | 2720 | in4 = _mm_add_epi16(stp2_4, stp2_11); |
michael@0 | 2721 | in5 = _mm_add_epi16(stp2_5, stp2_10); |
michael@0 | 2722 | in6 = _mm_add_epi16(stp2_6, stp1_9); |
michael@0 | 2723 | in7 = _mm_add_epi16(stp2_7, stp1_8); |
michael@0 | 2724 | in8 = _mm_sub_epi16(stp2_7, stp1_8); |
michael@0 | 2725 | in9 = _mm_sub_epi16(stp2_6, stp1_9); |
michael@0 | 2726 | in10 = _mm_sub_epi16(stp2_5, stp2_10); |
michael@0 | 2727 | in11 = _mm_sub_epi16(stp2_4, stp2_11); |
michael@0 | 2728 | in12 = _mm_sub_epi16(stp2_3, stp2_12); |
michael@0 | 2729 | in13 = _mm_sub_epi16(stp2_2, stp2_13); |
michael@0 | 2730 | in14 = _mm_sub_epi16(stp2_1, stp1_14); |
michael@0 | 2731 | in15 = _mm_sub_epi16(stp2_0, stp1_15); |
michael@0 | 2732 | |
michael@0 | 2733 | // Final rounding and shift |
michael@0 | 2734 | in0 = _mm_adds_epi16(in0, final_rounding); |
michael@0 | 2735 | in1 = _mm_adds_epi16(in1, final_rounding); |
michael@0 | 2736 | in2 = _mm_adds_epi16(in2, final_rounding); |
michael@0 | 2737 | in3 = _mm_adds_epi16(in3, final_rounding); |
michael@0 | 2738 | in4 = _mm_adds_epi16(in4, final_rounding); |
michael@0 | 2739 | in5 = _mm_adds_epi16(in5, final_rounding); |
michael@0 | 2740 | in6 = _mm_adds_epi16(in6, final_rounding); |
michael@0 | 2741 | in7 = _mm_adds_epi16(in7, final_rounding); |
michael@0 | 2742 | in8 = _mm_adds_epi16(in8, final_rounding); |
michael@0 | 2743 | in9 = _mm_adds_epi16(in9, final_rounding); |
michael@0 | 2744 | in10 = _mm_adds_epi16(in10, final_rounding); |
michael@0 | 2745 | in11 = _mm_adds_epi16(in11, final_rounding); |
michael@0 | 2746 | in12 = _mm_adds_epi16(in12, final_rounding); |
michael@0 | 2747 | in13 = _mm_adds_epi16(in13, final_rounding); |
michael@0 | 2748 | in14 = _mm_adds_epi16(in14, final_rounding); |
michael@0 | 2749 | in15 = _mm_adds_epi16(in15, final_rounding); |
michael@0 | 2750 | |
michael@0 | 2751 | in0 = _mm_srai_epi16(in0, 6); |
michael@0 | 2752 | in1 = _mm_srai_epi16(in1, 6); |
michael@0 | 2753 | in2 = _mm_srai_epi16(in2, 6); |
michael@0 | 2754 | in3 = _mm_srai_epi16(in3, 6); |
michael@0 | 2755 | in4 = _mm_srai_epi16(in4, 6); |
michael@0 | 2756 | in5 = _mm_srai_epi16(in5, 6); |
michael@0 | 2757 | in6 = _mm_srai_epi16(in6, 6); |
michael@0 | 2758 | in7 = _mm_srai_epi16(in7, 6); |
michael@0 | 2759 | in8 = _mm_srai_epi16(in8, 6); |
michael@0 | 2760 | in9 = _mm_srai_epi16(in9, 6); |
michael@0 | 2761 | in10 = _mm_srai_epi16(in10, 6); |
michael@0 | 2762 | in11 = _mm_srai_epi16(in11, 6); |
michael@0 | 2763 | in12 = _mm_srai_epi16(in12, 6); |
michael@0 | 2764 | in13 = _mm_srai_epi16(in13, 6); |
michael@0 | 2765 | in14 = _mm_srai_epi16(in14, 6); |
michael@0 | 2766 | in15 = _mm_srai_epi16(in15, 6); |
michael@0 | 2767 | |
michael@0 | 2768 | RECON_AND_STORE(dest, in0); |
michael@0 | 2769 | RECON_AND_STORE(dest, in1); |
michael@0 | 2770 | RECON_AND_STORE(dest, in2); |
michael@0 | 2771 | RECON_AND_STORE(dest, in3); |
michael@0 | 2772 | RECON_AND_STORE(dest, in4); |
michael@0 | 2773 | RECON_AND_STORE(dest, in5); |
michael@0 | 2774 | RECON_AND_STORE(dest, in6); |
michael@0 | 2775 | RECON_AND_STORE(dest, in7); |
michael@0 | 2776 | RECON_AND_STORE(dest, in8); |
michael@0 | 2777 | RECON_AND_STORE(dest, in9); |
michael@0 | 2778 | RECON_AND_STORE(dest, in10); |
michael@0 | 2779 | RECON_AND_STORE(dest, in11); |
michael@0 | 2780 | RECON_AND_STORE(dest, in12); |
michael@0 | 2781 | RECON_AND_STORE(dest, in13); |
michael@0 | 2782 | RECON_AND_STORE(dest, in14); |
michael@0 | 2783 | RECON_AND_STORE(dest, in15); |
michael@0 | 2784 | |
michael@0 | 2785 | dest += 8 - (stride * 16); |
michael@0 | 2786 | } |
michael@0 | 2787 | } |
michael@0 | 2788 | |
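// Loads one 8-lane register of dequantized coefficients and advances the
// input pointer, so successive invocations walk the buffer in order.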
michael@0 | 2789 | #define LOAD_DQCOEFF(reg, input) \ |
michael@0 | 2790 | { \ |
michael@0 | 2791 | reg = _mm_load_si128((const __m128i *) input); \ |
michael@0 | 2792 | input += 8; \ |
michael@0 | 2793 | } |
michael@0 | 2794 | |
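// IDCT32_1D: 32-point 1-D inverse DCT over in0..in31, written as a macro
// so the callers below can instantiate it once per 8-column block.
// MULTIPLICATION_AND_ADD (defined earlier) bundles the same
// unpack/madd/round/shift/pack butterfly used in the 16-point code above.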
michael@0 | 2795 | #define IDCT32_1D \ |
michael@0 | 2796 | /* Stage1 */ \ |
michael@0 | 2797 | { \ |
michael@0 | 2798 | const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ |
michael@0 | 2799 | const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ |
michael@0 | 2800 | const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ |
michael@0 | 2801 | const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ |
michael@0 | 2802 | \ |
michael@0 | 2803 | const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ |
michael@0 | 2804 | const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ |
michael@0 | 2805 | const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \ |
michael@0 | 2806 | const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ |
michael@0 | 2807 | \ |
michael@0 | 2808 | const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ |
michael@0 | 2809 | const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ |
michael@0 | 2810 | const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ |
michael@0 | 2811 | const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ |
michael@0 | 2812 | \ |
michael@0 | 2813 | const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ |
michael@0 | 2814 | const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ |
michael@0 | 2815 | const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ |
michael@0 | 2816 | const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ |
michael@0 | 2817 | \ |
michael@0 | 2818 | MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ |
michael@0 | 2819 | stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ |
michael@0 | 2820 | stp1_17, stp1_30) \ |
michael@0 | 2821 | MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ |
michael@0 | 2822 | stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ |
michael@0 | 2823 | stp1_19, stp1_28) \ |
michael@0 | 2824 | MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ |
michael@0 | 2825 | stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ |
michael@0 | 2826 | stp1_21, stp1_26) \ |
michael@0 | 2827 | MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ |
michael@0 | 2828 | stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ |
michael@0 | 2829 | stp1_23, stp1_24) \ |
michael@0 | 2830 | } \ |
michael@0 | 2831 | \ |
michael@0 | 2832 | /* Stage2 */ \ |
michael@0 | 2833 | { \ |
michael@0 | 2834 | const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ |
michael@0 | 2835 | const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ |
michael@0 | 2836 | const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ |
michael@0 | 2837 | const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ |
michael@0 | 2838 | \ |
michael@0 | 2839 | const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ |
michael@0 | 2840 | const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ |
michael@0 | 2841 | const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ |
michael@0 | 2842 | const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ |
michael@0 | 2843 | \ |
michael@0 | 2844 | MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ |
michael@0 | 2845 | stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ |
michael@0 | 2846 | stp2_14) \ |
michael@0 | 2847 | MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ |
michael@0 | 2848 | stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ |
michael@0 | 2849 | stp2_11, stp2_12) \ |
michael@0 | 2850 | \ |
michael@0 | 2851 | stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ |
michael@0 | 2852 | stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ |
michael@0 | 2853 | stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ |
michael@0 | 2854 | stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ |
michael@0 | 2855 | \ |
michael@0 | 2856 | stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ |
michael@0 | 2857 | stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ |
michael@0 | 2858 | stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ |
michael@0 | 2859 | stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ |
michael@0 | 2860 | \ |
michael@0 | 2861 | stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ |
michael@0 | 2862 | stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ |
michael@0 | 2863 | stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ |
michael@0 | 2864 | stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ |
michael@0 | 2865 | \ |
michael@0 | 2866 | stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ |
michael@0 | 2867 | stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ |
michael@0 | 2868 | stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ |
michael@0 | 2869 | stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ |
michael@0 | 2870 | } \ |
michael@0 | 2871 | \ |
michael@0 | 2872 | /* Stage3 */ \ |
michael@0 | 2873 | { \ |
michael@0 | 2874 | const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ |
michael@0 | 2875 | const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ |
michael@0 | 2876 | const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ |
michael@0 | 2877 | const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ |
michael@0 | 2878 | \ |
michael@0 | 2879 | const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ |
michael@0 | 2880 | const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ |
michael@0 | 2881 | const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
michael@0 | 2882 | const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
michael@0 | 2883 | \ |
michael@0 | 2884 | const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
michael@0 | 2885 | const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
michael@0 | 2886 | const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
michael@0 | 2887 | const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
michael@0 | 2888 | \ |
michael@0 | 2889 | MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ |
michael@0 | 2890 | stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ |
michael@0 | 2891 | stp1_6) \ |
michael@0 | 2892 | \ |
michael@0 | 2893 | stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ |
michael@0 | 2894 | stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ |
michael@0 | 2895 | stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ |
michael@0 | 2896 | stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ |
michael@0 | 2897 | stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ |
michael@0 | 2898 | stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ |
michael@0 | 2899 | stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ |
michael@0 | 2900 | stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ |
michael@0 | 2901 | \ |
michael@0 | 2902 | MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ |
michael@0 | 2903 | stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ |
michael@0 | 2904 | stp1_18, stp1_29) \ |
michael@0 | 2905 | MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ |
michael@0 | 2906 | stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ |
michael@0 | 2907 | stp1_22, stp1_25) \ |
michael@0 | 2908 | \ |
michael@0 | 2909 | stp1_16 = stp2_16; \ |
michael@0 | 2910 | stp1_31 = stp2_31; \ |
michael@0 | 2911 | stp1_19 = stp2_19; \ |
michael@0 | 2912 | stp1_20 = stp2_20; \ |
michael@0 | 2913 | stp1_23 = stp2_23; \ |
michael@0 | 2914 | stp1_24 = stp2_24; \ |
michael@0 | 2915 | stp1_27 = stp2_27; \ |
michael@0 | 2916 | stp1_28 = stp2_28; \ |
michael@0 | 2917 | } \ |
michael@0 | 2918 | \ |
michael@0 | 2919 | /* Stage4 */ \ |
michael@0 | 2920 | { \ |
michael@0 | 2921 | const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ |
michael@0 | 2922 | const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ |
michael@0 | 2923 | const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ |
michael@0 | 2924 | const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ |
michael@0 | 2925 | \ |
michael@0 | 2926 | const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ |
michael@0 | 2927 | const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ |
michael@0 | 2928 | const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
michael@0 | 2929 | const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
michael@0 | 2930 | \ |
michael@0 | 2931 | MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ |
michael@0 | 2932 | stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ |
michael@0 | 2933 | stp2_2, stp2_3) \ |
michael@0 | 2934 | \ |
michael@0 | 2935 | stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ |
michael@0 | 2936 | stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ |
michael@0 | 2937 | stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ |
michael@0 | 2938 | stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ |
michael@0 | 2939 | \ |
michael@0 | 2940 | MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ |
michael@0 | 2941 | stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ |
michael@0 | 2942 | stp2_10, stp2_13) \ |
michael@0 | 2943 | \ |
michael@0 | 2944 | stp2_8 = stp1_8; \ |
michael@0 | 2945 | stp2_15 = stp1_15; \ |
michael@0 | 2946 | stp2_11 = stp1_11; \ |
michael@0 | 2947 | stp2_12 = stp1_12; \ |
michael@0 | 2948 | \ |
michael@0 | 2949 | stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ |
michael@0 | 2950 | stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ |
michael@0 | 2951 | stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ |
michael@0 | 2952 | stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ |
michael@0 | 2953 | stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ |
michael@0 | 2954 | stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ |
michael@0 | 2955 | stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ |
michael@0 | 2956 | stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ |
michael@0 | 2957 | \ |
michael@0 | 2958 | stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ |
michael@0 | 2959 | stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ |
michael@0 | 2960 | stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ |
michael@0 | 2961 | stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ |
michael@0 | 2962 | stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ |
michael@0 | 2963 | stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ |
michael@0 | 2964 | stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ |
michael@0 | 2965 | stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ |
michael@0 | 2966 | } \ |
michael@0 | 2967 | \ |
michael@0 | 2968 | /* Stage5 */ \ |
michael@0 | 2969 | { \ |
michael@0 | 2970 | const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ |
michael@0 | 2971 | const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ |
michael@0 | 2972 | const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
michael@0 | 2973 | const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
michael@0 | 2974 | \ |
michael@0 | 2975 | const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ |
michael@0 | 2976 | const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ |
michael@0 | 2977 | const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
michael@0 | 2978 | const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
michael@0 | 2979 | \ |
michael@0 | 2980 | const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
michael@0 | 2981 | const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
michael@0 | 2982 | \ |
michael@0 | 2983 | stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ |
michael@0 | 2984 | stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ |
michael@0 | 2985 | stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ |
michael@0 | 2986 | stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ |
michael@0 | 2987 | \ |
michael@0 | 2988 | tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ |
michael@0 | 2989 | tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ |
michael@0 | 2990 | tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ |
michael@0 | 2991 | tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ |
michael@0 | 2992 | \ |
michael@0 | 2993 | tmp0 = _mm_add_epi32(tmp0, rounding); \ |
michael@0 | 2994 | tmp1 = _mm_add_epi32(tmp1, rounding); \ |
michael@0 | 2995 | tmp2 = _mm_add_epi32(tmp2, rounding); \ |
michael@0 | 2996 | tmp3 = _mm_add_epi32(tmp3, rounding); \ |
michael@0 | 2997 | \ |
michael@0 | 2998 | tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ |
michael@0 | 2999 | tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ |
michael@0 | 3000 | tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ |
michael@0 | 3001 | tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ |
michael@0 | 3002 | \ |
michael@0 | 3003 | stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ |
michael@0 | 3004 | stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ |
michael@0 | 3005 | \ |
michael@0 | 3006 | stp1_4 = stp2_4; \ |
michael@0 | 3007 | stp1_7 = stp2_7; \ |
michael@0 | 3008 | \ |
michael@0 | 3009 | stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ |
michael@0 | 3010 | stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ |
michael@0 | 3011 | stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ |
michael@0 | 3012 | stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ |
michael@0 | 3013 | stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ |
michael@0 | 3014 | stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ |
michael@0 | 3015 | stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ |
michael@0 | 3016 | stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ |
michael@0 | 3017 | \ |
michael@0 | 3018 | stp1_16 = stp2_16; \ |
michael@0 | 3019 | stp1_17 = stp2_17; \ |
michael@0 | 3020 | \ |
michael@0 | 3021 | MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ |
michael@0 | 3022 | stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ |
michael@0 | 3023 | stp1_19, stp1_28) \ |
michael@0 | 3024 | MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ |
michael@0 | 3025 | stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ |
michael@0 | 3026 | stp1_21, stp1_26) \ |
michael@0 | 3027 | \ |
michael@0 | 3028 | stp1_22 = stp2_22; \ |
michael@0 | 3029 | stp1_23 = stp2_23; \ |
michael@0 | 3030 | stp1_24 = stp2_24; \ |
michael@0 | 3031 | stp1_25 = stp2_25; \ |
michael@0 | 3032 | stp1_30 = stp2_30; \ |
michael@0 | 3033 | stp1_31 = stp2_31; \ |
michael@0 | 3034 | } \ |
michael@0 | 3035 | \ |
michael@0 | 3036 | /* Stage6 */ \ |
michael@0 | 3037 | { \ |
michael@0 | 3038 | const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
michael@0 | 3039 | const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
michael@0 | 3040 | const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ |
michael@0 | 3041 | const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ |
michael@0 | 3042 | \ |
michael@0 | 3043 | stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ |
michael@0 | 3044 | stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ |
michael@0 | 3045 | stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ |
michael@0 | 3046 | stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ |
michael@0 | 3047 | stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ |
michael@0 | 3048 | stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
michael@0 | 3049 | stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
michael@0 | 3050 | stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ |
michael@0 | 3051 | \ |
michael@0 | 3052 | stp2_8 = stp1_8; \ |
michael@0 | 3053 | stp2_9 = stp1_9; \ |
michael@0 | 3054 | stp2_14 = stp1_14; \ |
michael@0 | 3055 | stp2_15 = stp1_15; \ |
michael@0 | 3056 | \ |
michael@0 | 3057 | MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
michael@0 | 3058 | stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ |
michael@0 | 3059 | stp2_13, stp2_11, stp2_12) \ |
michael@0 | 3060 | \ |
michael@0 | 3061 | stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ |
michael@0 | 3062 | stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ |
michael@0 | 3063 | stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ |
michael@0 | 3064 | stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ |
michael@0 | 3065 | stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ |
michael@0 | 3066 | stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ |
michael@0 | 3067 | stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ |
michael@0 | 3068 | stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ |
michael@0 | 3069 | \ |
michael@0 | 3070 | stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ |
michael@0 | 3071 | stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ |
michael@0 | 3072 | stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ |
michael@0 | 3073 | stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ |
michael@0 | 3074 | stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ |
michael@0 | 3075 | stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ |
michael@0 | 3076 | stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ |
michael@0 | 3077 | stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ |
michael@0 | 3078 | } \ |
michael@0 | 3079 | \ |
michael@0 | 3080 | /* Stage7 */ \ |
michael@0 | 3081 | { \ |
michael@0 | 3082 | const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
michael@0 | 3083 | const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
michael@0 | 3084 | const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
michael@0 | 3085 | const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
michael@0 | 3086 | \ |
michael@0 | 3087 | const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
michael@0 | 3088 | const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
michael@0 | 3089 | const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ |
michael@0 | 3090 | const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ |
michael@0 | 3091 | \ |
michael@0 | 3092 | stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ |
michael@0 | 3093 | stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ |
michael@0 | 3094 | stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ |
michael@0 | 3095 | stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ |
michael@0 | 3096 | stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ |
michael@0 | 3097 | stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ |
michael@0 | 3098 | stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ |
michael@0 | 3099 | stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ |
michael@0 | 3100 | stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ |
michael@0 | 3101 | stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ |
michael@0 | 3102 | stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ |
michael@0 | 3103 | stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ |
michael@0 | 3104 | stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ |
michael@0 | 3105 | stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ |
michael@0 | 3106 | stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ |
michael@0 | 3107 | stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ |
michael@0 | 3108 | \ |
michael@0 | 3109 | stp1_16 = stp2_16; \ |
michael@0 | 3110 | stp1_17 = stp2_17; \ |
michael@0 | 3111 | stp1_18 = stp2_18; \ |
michael@0 | 3112 | stp1_19 = stp2_19; \ |
michael@0 | 3113 | \ |
michael@0 | 3114 | MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ |
michael@0 | 3115 | stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ |
michael@0 | 3116 | stp1_21, stp1_26) \ |
michael@0 | 3117 | MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ |
michael@0 | 3118 | stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ |
michael@0 | 3119 | stp1_23, stp1_24) \ |
michael@0 | 3120 | \ |
michael@0 | 3121 | stp1_28 = stp2_28; \ |
michael@0 | 3122 | stp1_29 = stp2_29; \ |
michael@0 | 3123 | stp1_30 = stp2_30; \ |
michael@0 | 3124 | stp1_31 = stp2_31; \ |
michael@0 | 3125 | } |
michael@0 | 3126 | |
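/*
 * Note on the MULTIPLICATION_AND_ADD steps in the macro above: each
 * invocation performs two fixed-point butterfly rotations on interleaved
 * 16-bit lanes. A scalar sketch of one output lane (variable names here
 * are illustrative only, not part of the source):
 *
 *   int32_t t = (int32_t)a * c0 + (int32_t)b * c1;  // what _mm_madd_epi16 does
 *   int16_t r = (int16_t)((t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
 *
 * where c0/c1 are cosine constants scaled by 2^DCT_CONST_BITS and
 * DCT_CONST_ROUNDING = 1 << (DCT_CONST_BITS - 1) gives round-to-nearest,
 * matching the explicit madd/add/srai/packs sequence written out for
 * stp1_5/stp1_6 in Stage5.
 */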
michael@0 | 3127 | // Only the upper-left 8x8 block has non-zero coefficients. |
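// The "_34" suffix refers to the eob threshold assumed by the caller: at
// most 34 non-zero coefficients, all inside that upper-left 8x8 corner.
// The first pass below therefore loads only the first 8 rows; the
// remaining 24 rows contribute zeros directly.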
michael@0 | 3128 | void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
michael@0 | 3129 | int stride) { |
michael@0 | 3130 | const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
michael@0 | 3131 | const __m128i final_rounding = _mm_set1_epi16(1<<5); |
michael@0 | 3132 | |
michael@0 | 3133 | // idct constants for each stage |
michael@0 | 3134 | const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
michael@0 | 3135 | const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
michael@0 | 3136 | const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
michael@0 | 3137 | const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
michael@0 | 3138 | const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
michael@0 | 3139 | const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
michael@0 | 3140 | const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
michael@0 | 3141 | const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
michael@0 | 3142 | const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
michael@0 | 3143 | const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); |
michael@0 | 3144 | const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); |
michael@0 | 3145 | const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); |
michael@0 | 3146 | const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
michael@0 | 3147 | const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); |
michael@0 | 3148 | const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
michael@0 | 3149 | const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); |
michael@0 | 3150 | |
michael@0 | 3151 | const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
michael@0 | 3152 | const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
michael@0 | 3153 | const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
michael@0 | 3154 | const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
michael@0 | 3155 | const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
michael@0 | 3156 | const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
michael@0 | 3157 | const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
michael@0 | 3158 | const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
michael@0 | 3159 | |
michael@0 | 3160 | const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
michael@0 | 3161 | const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
michael@0 | 3162 | const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
michael@0 | 3163 | const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
michael@0 | 3164 | const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
michael@0 | 3165 | const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); |
michael@0 | 3166 | const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
michael@0 | 3167 | const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
michael@0 | 3168 | const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); |
michael@0 | 3169 | const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
michael@0 | 3170 | |
michael@0 | 3171 | const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
michael@0 | 3172 | const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
michael@0 | 3173 | const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
michael@0 | 3174 | const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
michael@0 | 3175 | const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 3176 | const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
michael@0 | 3177 | const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
michael@0 | 3178 | |
michael@0 | 3179 | const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
michael@0 | 3180 | |
michael@0 | 3181 | __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, |
michael@0 | 3182 | in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, |
michael@0 | 3183 | in24, in25, in26, in27, in28, in29, in30, in31; |
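  // col[] holds the first pass's 32x32 intermediate result as four 8-lane
  // column strips of 32 row vectors each (4 * 32 = 128 __m128i).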
michael@0 | 3184 | __m128i col[128]; |
michael@0 | 3185 | __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
michael@0 | 3186 | stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
michael@0 | 3187 | stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
michael@0 | 3188 | stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
michael@0 | 3189 | stp1_30, stp1_31; |
michael@0 | 3190 | __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
michael@0 | 3191 | stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
michael@0 | 3192 | stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
michael@0 | 3193 | stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
michael@0 | 3194 | stp2_30, stp2_31; |
michael@0 | 3195 | __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
michael@0 | 3196 | int i, j, i32; |
michael@0 | 3197 | |
michael@0 | 3198 |   // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. |
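  // Pass structure: iterations 0-3 run the first (row) 1-D idct and store
  // the results in col[]; iterations 4-7 transpose col[] back and run the
  // second (column) 1-D idct, reconstructing into dest. In this variant
  // only iteration 0 has non-zero input, so iterations 1-3 just zero-fill
  // their part of col[] and continue.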
michael@0 | 3199 | for (i = 0; i < 8; i++) { |
michael@0 | 3200 | i32 = (i << 5); |
michael@0 | 3201 | if (i == 0) { |
michael@0 | 3202 | // First 1-D idct: first 8 rows |
michael@0 | 3203 | // Load input data. |
michael@0 | 3204 | LOAD_DQCOEFF(in0, input); |
michael@0 | 3205 | LOAD_DQCOEFF(in8, input); |
michael@0 | 3206 | LOAD_DQCOEFF(in16, input); |
michael@0 | 3207 | LOAD_DQCOEFF(in24, input); |
michael@0 | 3208 | LOAD_DQCOEFF(in1, input); |
michael@0 | 3209 | LOAD_DQCOEFF(in9, input); |
michael@0 | 3210 | LOAD_DQCOEFF(in17, input); |
michael@0 | 3211 | LOAD_DQCOEFF(in25, input); |
michael@0 | 3212 | LOAD_DQCOEFF(in2, input); |
michael@0 | 3213 | LOAD_DQCOEFF(in10, input); |
michael@0 | 3214 | LOAD_DQCOEFF(in18, input); |
michael@0 | 3215 | LOAD_DQCOEFF(in26, input); |
michael@0 | 3216 | LOAD_DQCOEFF(in3, input); |
michael@0 | 3217 | LOAD_DQCOEFF(in11, input); |
michael@0 | 3218 | LOAD_DQCOEFF(in19, input); |
michael@0 | 3219 | LOAD_DQCOEFF(in27, input); |
michael@0 | 3220 | |
michael@0 | 3221 | LOAD_DQCOEFF(in4, input); |
michael@0 | 3222 | LOAD_DQCOEFF(in12, input); |
michael@0 | 3223 | LOAD_DQCOEFF(in20, input); |
michael@0 | 3224 | LOAD_DQCOEFF(in28, input); |
michael@0 | 3225 | LOAD_DQCOEFF(in5, input); |
michael@0 | 3226 | LOAD_DQCOEFF(in13, input); |
michael@0 | 3227 | LOAD_DQCOEFF(in21, input); |
michael@0 | 3228 | LOAD_DQCOEFF(in29, input); |
michael@0 | 3229 | LOAD_DQCOEFF(in6, input); |
michael@0 | 3230 | LOAD_DQCOEFF(in14, input); |
michael@0 | 3231 | LOAD_DQCOEFF(in22, input); |
michael@0 | 3232 | LOAD_DQCOEFF(in30, input); |
michael@0 | 3233 | LOAD_DQCOEFF(in7, input); |
michael@0 | 3234 | LOAD_DQCOEFF(in15, input); |
michael@0 | 3235 | LOAD_DQCOEFF(in23, input); |
michael@0 | 3236 | LOAD_DQCOEFF(in31, input); |
michael@0 | 3237 | |
michael@0 | 3238 |       // Transpose the 32x8 block to an 8x32 block |
michael@0 | 3239 | TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
michael@0 | 3240 | in4, in5, in6, in7); |
michael@0 | 3241 | TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
michael@0 | 3242 | in10, in11, in12, in13, in14, in15); |
michael@0 | 3243 | TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, |
michael@0 | 3244 | in18, in19, in20, in21, in22, in23); |
michael@0 | 3245 | TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, |
michael@0 | 3246 | in26, in27, in28, in29, in30, in31); |
michael@0 | 3247 | } else if (i < 4) { |
michael@0 | 3248 |       // First 1-D idct: the remaining 24 rows hold only zero coefficients |
michael@0 | 3249 | col[i32 + 0] = _mm_setzero_si128(); |
michael@0 | 3250 | col[i32 + 1] = _mm_setzero_si128(); |
michael@0 | 3251 | col[i32 + 2] = _mm_setzero_si128(); |
michael@0 | 3252 | col[i32 + 3] = _mm_setzero_si128(); |
michael@0 | 3253 | col[i32 + 4] = _mm_setzero_si128(); |
michael@0 | 3254 | col[i32 + 5] = _mm_setzero_si128(); |
michael@0 | 3255 | col[i32 + 6] = _mm_setzero_si128(); |
michael@0 | 3256 | col[i32 + 7] = _mm_setzero_si128(); |
michael@0 | 3257 | col[i32 + 8] = _mm_setzero_si128(); |
michael@0 | 3258 | col[i32 + 9] = _mm_setzero_si128(); |
michael@0 | 3259 | col[i32 + 10] = _mm_setzero_si128(); |
michael@0 | 3260 | col[i32 + 11] = _mm_setzero_si128(); |
michael@0 | 3261 | col[i32 + 12] = _mm_setzero_si128(); |
michael@0 | 3262 | col[i32 + 13] = _mm_setzero_si128(); |
michael@0 | 3263 | col[i32 + 14] = _mm_setzero_si128(); |
michael@0 | 3264 | col[i32 + 15] = _mm_setzero_si128(); |
michael@0 | 3265 | col[i32 + 16] = _mm_setzero_si128(); |
michael@0 | 3266 | col[i32 + 17] = _mm_setzero_si128(); |
michael@0 | 3267 | col[i32 + 18] = _mm_setzero_si128(); |
michael@0 | 3268 | col[i32 + 19] = _mm_setzero_si128(); |
michael@0 | 3269 | col[i32 + 20] = _mm_setzero_si128(); |
michael@0 | 3270 | col[i32 + 21] = _mm_setzero_si128(); |
michael@0 | 3271 | col[i32 + 22] = _mm_setzero_si128(); |
michael@0 | 3272 | col[i32 + 23] = _mm_setzero_si128(); |
michael@0 | 3273 | col[i32 + 24] = _mm_setzero_si128(); |
michael@0 | 3274 | col[i32 + 25] = _mm_setzero_si128(); |
michael@0 | 3275 | col[i32 + 26] = _mm_setzero_si128(); |
michael@0 | 3276 | col[i32 + 27] = _mm_setzero_si128(); |
michael@0 | 3277 | col[i32 + 28] = _mm_setzero_si128(); |
michael@0 | 3278 | col[i32 + 29] = _mm_setzero_si128(); |
michael@0 | 3279 | col[i32 + 30] = _mm_setzero_si128(); |
michael@0 | 3280 | col[i32 + 31] = _mm_setzero_si128(); |
michael@0 | 3281 | continue; |
michael@0 | 3282 | } else { |
michael@0 | 3283 | // Second 1-D idct |
michael@0 | 3284 | j = i - 4; |
michael@0 | 3285 | |
michael@0 | 3286 |       // Transpose the 32x8 block to an 8x32 block |
michael@0 | 3287 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3288 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3289 | col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, |
michael@0 | 3290 | in5, in6, in7); |
michael@0 | 3291 | j += 4; |
michael@0 | 3292 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3293 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3294 | col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, |
michael@0 | 3295 | in11, in12, in13, in14, in15); |
michael@0 | 3296 | j += 4; |
michael@0 | 3297 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3298 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3299 | col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, |
michael@0 | 3300 | in19, in20, in21, in22, in23); |
michael@0 | 3301 | j += 4; |
michael@0 | 3302 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3303 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3304 | col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, |
michael@0 | 3305 | in28, in29, in30, in31); |
michael@0 | 3306 | } |
michael@0 | 3307 | |
michael@0 | 3308 | IDCT32_1D |
michael@0 | 3309 | |
michael@0 | 3310 | // final stage |
michael@0 | 3311 | if (i < 4) { |
michael@0 | 3312 |       // 1-D: Store the 32 intermediate results for each 8x32 block. |
michael@0 | 3313 | col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
michael@0 | 3314 | col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
michael@0 | 3315 | col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
michael@0 | 3316 | col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
michael@0 | 3317 | col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
michael@0 | 3318 | col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
michael@0 | 3319 | col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
michael@0 | 3320 | col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
michael@0 | 3321 | col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
michael@0 | 3322 | col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
michael@0 | 3323 | col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
michael@0 | 3324 | col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
michael@0 | 3325 | col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
michael@0 | 3326 | col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
michael@0 | 3327 | col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
michael@0 | 3328 | col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
michael@0 | 3329 | col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
michael@0 | 3330 | col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
michael@0 | 3331 | col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
michael@0 | 3332 | col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
michael@0 | 3333 | col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
michael@0 | 3334 | col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
michael@0 | 3335 | col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
michael@0 | 3336 | col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
michael@0 | 3337 | col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
michael@0 | 3338 | col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
michael@0 | 3339 | col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
michael@0 | 3340 | col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
michael@0 | 3341 | col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
michael@0 | 3342 | col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
michael@0 | 3343 | col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
michael@0 | 3344 | col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
michael@0 | 3345 | } else { |
michael@0 | 3346 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 3347 | |
michael@0 | 3348 |       // 2-D: Calculate the final results and store them to the destination. |
michael@0 | 3349 | in0 = _mm_add_epi16(stp1_0, stp1_31); |
michael@0 | 3350 | in1 = _mm_add_epi16(stp1_1, stp1_30); |
michael@0 | 3351 | in2 = _mm_add_epi16(stp1_2, stp1_29); |
michael@0 | 3352 | in3 = _mm_add_epi16(stp1_3, stp1_28); |
michael@0 | 3353 | in4 = _mm_add_epi16(stp1_4, stp1_27); |
michael@0 | 3354 | in5 = _mm_add_epi16(stp1_5, stp1_26); |
michael@0 | 3355 | in6 = _mm_add_epi16(stp1_6, stp1_25); |
michael@0 | 3356 | in7 = _mm_add_epi16(stp1_7, stp1_24); |
michael@0 | 3357 | in8 = _mm_add_epi16(stp1_8, stp1_23); |
michael@0 | 3358 | in9 = _mm_add_epi16(stp1_9, stp1_22); |
michael@0 | 3359 | in10 = _mm_add_epi16(stp1_10, stp1_21); |
michael@0 | 3360 | in11 = _mm_add_epi16(stp1_11, stp1_20); |
michael@0 | 3361 | in12 = _mm_add_epi16(stp1_12, stp1_19); |
michael@0 | 3362 | in13 = _mm_add_epi16(stp1_13, stp1_18); |
michael@0 | 3363 | in14 = _mm_add_epi16(stp1_14, stp1_17); |
michael@0 | 3364 | in15 = _mm_add_epi16(stp1_15, stp1_16); |
michael@0 | 3365 | in16 = _mm_sub_epi16(stp1_15, stp1_16); |
michael@0 | 3366 | in17 = _mm_sub_epi16(stp1_14, stp1_17); |
michael@0 | 3367 | in18 = _mm_sub_epi16(stp1_13, stp1_18); |
michael@0 | 3368 | in19 = _mm_sub_epi16(stp1_12, stp1_19); |
michael@0 | 3369 | in20 = _mm_sub_epi16(stp1_11, stp1_20); |
michael@0 | 3370 | in21 = _mm_sub_epi16(stp1_10, stp1_21); |
michael@0 | 3371 | in22 = _mm_sub_epi16(stp1_9, stp1_22); |
michael@0 | 3372 | in23 = _mm_sub_epi16(stp1_8, stp1_23); |
michael@0 | 3373 | in24 = _mm_sub_epi16(stp1_7, stp1_24); |
michael@0 | 3374 | in25 = _mm_sub_epi16(stp1_6, stp1_25); |
michael@0 | 3375 | in26 = _mm_sub_epi16(stp1_5, stp1_26); |
michael@0 | 3376 | in27 = _mm_sub_epi16(stp1_4, stp1_27); |
michael@0 | 3377 | in28 = _mm_sub_epi16(stp1_3, stp1_28); |
michael@0 | 3378 | in29 = _mm_sub_epi16(stp1_2, stp1_29); |
michael@0 | 3379 | in30 = _mm_sub_epi16(stp1_1, stp1_30); |
michael@0 | 3380 | in31 = _mm_sub_epi16(stp1_0, stp1_31); |
michael@0 | 3381 | |
michael@0 | 3382 | // Final rounding and shift |
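      // (x + (1 << 5)) >> 6 computes round(x / 64), the output scaling of
      // the 2-D transform; the saturating _mm_adds_epi16 guards against
      // wrap-around at the int16 limits.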
michael@0 | 3383 | in0 = _mm_adds_epi16(in0, final_rounding); |
michael@0 | 3384 | in1 = _mm_adds_epi16(in1, final_rounding); |
michael@0 | 3385 | in2 = _mm_adds_epi16(in2, final_rounding); |
michael@0 | 3386 | in3 = _mm_adds_epi16(in3, final_rounding); |
michael@0 | 3387 | in4 = _mm_adds_epi16(in4, final_rounding); |
michael@0 | 3388 | in5 = _mm_adds_epi16(in5, final_rounding); |
michael@0 | 3389 | in6 = _mm_adds_epi16(in6, final_rounding); |
michael@0 | 3390 | in7 = _mm_adds_epi16(in7, final_rounding); |
michael@0 | 3391 | in8 = _mm_adds_epi16(in8, final_rounding); |
michael@0 | 3392 | in9 = _mm_adds_epi16(in9, final_rounding); |
michael@0 | 3393 | in10 = _mm_adds_epi16(in10, final_rounding); |
michael@0 | 3394 | in11 = _mm_adds_epi16(in11, final_rounding); |
michael@0 | 3395 | in12 = _mm_adds_epi16(in12, final_rounding); |
michael@0 | 3396 | in13 = _mm_adds_epi16(in13, final_rounding); |
michael@0 | 3397 | in14 = _mm_adds_epi16(in14, final_rounding); |
michael@0 | 3398 | in15 = _mm_adds_epi16(in15, final_rounding); |
michael@0 | 3399 | in16 = _mm_adds_epi16(in16, final_rounding); |
michael@0 | 3400 | in17 = _mm_adds_epi16(in17, final_rounding); |
michael@0 | 3401 | in18 = _mm_adds_epi16(in18, final_rounding); |
michael@0 | 3402 | in19 = _mm_adds_epi16(in19, final_rounding); |
michael@0 | 3403 | in20 = _mm_adds_epi16(in20, final_rounding); |
michael@0 | 3404 | in21 = _mm_adds_epi16(in21, final_rounding); |
michael@0 | 3405 | in22 = _mm_adds_epi16(in22, final_rounding); |
michael@0 | 3406 | in23 = _mm_adds_epi16(in23, final_rounding); |
michael@0 | 3407 | in24 = _mm_adds_epi16(in24, final_rounding); |
michael@0 | 3408 | in25 = _mm_adds_epi16(in25, final_rounding); |
michael@0 | 3409 | in26 = _mm_adds_epi16(in26, final_rounding); |
michael@0 | 3410 | in27 = _mm_adds_epi16(in27, final_rounding); |
michael@0 | 3411 | in28 = _mm_adds_epi16(in28, final_rounding); |
michael@0 | 3412 | in29 = _mm_adds_epi16(in29, final_rounding); |
michael@0 | 3413 | in30 = _mm_adds_epi16(in30, final_rounding); |
michael@0 | 3414 | in31 = _mm_adds_epi16(in31, final_rounding); |
michael@0 | 3415 | |
michael@0 | 3416 | in0 = _mm_srai_epi16(in0, 6); |
michael@0 | 3417 | in1 = _mm_srai_epi16(in1, 6); |
michael@0 | 3418 | in2 = _mm_srai_epi16(in2, 6); |
michael@0 | 3419 | in3 = _mm_srai_epi16(in3, 6); |
michael@0 | 3420 | in4 = _mm_srai_epi16(in4, 6); |
michael@0 | 3421 | in5 = _mm_srai_epi16(in5, 6); |
michael@0 | 3422 | in6 = _mm_srai_epi16(in6, 6); |
michael@0 | 3423 | in7 = _mm_srai_epi16(in7, 6); |
michael@0 | 3424 | in8 = _mm_srai_epi16(in8, 6); |
michael@0 | 3425 | in9 = _mm_srai_epi16(in9, 6); |
michael@0 | 3426 | in10 = _mm_srai_epi16(in10, 6); |
michael@0 | 3427 | in11 = _mm_srai_epi16(in11, 6); |
michael@0 | 3428 | in12 = _mm_srai_epi16(in12, 6); |
michael@0 | 3429 | in13 = _mm_srai_epi16(in13, 6); |
michael@0 | 3430 | in14 = _mm_srai_epi16(in14, 6); |
michael@0 | 3431 | in15 = _mm_srai_epi16(in15, 6); |
michael@0 | 3432 | in16 = _mm_srai_epi16(in16, 6); |
michael@0 | 3433 | in17 = _mm_srai_epi16(in17, 6); |
michael@0 | 3434 | in18 = _mm_srai_epi16(in18, 6); |
michael@0 | 3435 | in19 = _mm_srai_epi16(in19, 6); |
michael@0 | 3436 | in20 = _mm_srai_epi16(in20, 6); |
michael@0 | 3437 | in21 = _mm_srai_epi16(in21, 6); |
michael@0 | 3438 | in22 = _mm_srai_epi16(in22, 6); |
michael@0 | 3439 | in23 = _mm_srai_epi16(in23, 6); |
michael@0 | 3440 | in24 = _mm_srai_epi16(in24, 6); |
michael@0 | 3441 | in25 = _mm_srai_epi16(in25, 6); |
michael@0 | 3442 | in26 = _mm_srai_epi16(in26, 6); |
michael@0 | 3443 | in27 = _mm_srai_epi16(in27, 6); |
michael@0 | 3444 | in28 = _mm_srai_epi16(in28, 6); |
michael@0 | 3445 | in29 = _mm_srai_epi16(in29, 6); |
michael@0 | 3446 | in30 = _mm_srai_epi16(in30, 6); |
michael@0 | 3447 | in31 = _mm_srai_epi16(in31, 6); |
michael@0 | 3448 | |
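      // RECON_AND_STORE is defined earlier in this file; the assumed
      // behavior (a sketch, not a redefinition) is: load one 8-pixel row
      // of dest, widen to 16 bits, add the residual row, saturate-pack
      // back to 8-bit pixels, store, and advance dest by stride.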
michael@0 | 3449 | RECON_AND_STORE(dest, in0); |
michael@0 | 3450 | RECON_AND_STORE(dest, in1); |
michael@0 | 3451 | RECON_AND_STORE(dest, in2); |
michael@0 | 3452 | RECON_AND_STORE(dest, in3); |
michael@0 | 3453 | RECON_AND_STORE(dest, in4); |
michael@0 | 3454 | RECON_AND_STORE(dest, in5); |
michael@0 | 3455 | RECON_AND_STORE(dest, in6); |
michael@0 | 3456 | RECON_AND_STORE(dest, in7); |
michael@0 | 3457 | RECON_AND_STORE(dest, in8); |
michael@0 | 3458 | RECON_AND_STORE(dest, in9); |
michael@0 | 3459 | RECON_AND_STORE(dest, in10); |
michael@0 | 3460 | RECON_AND_STORE(dest, in11); |
michael@0 | 3461 | RECON_AND_STORE(dest, in12); |
michael@0 | 3462 | RECON_AND_STORE(dest, in13); |
michael@0 | 3463 | RECON_AND_STORE(dest, in14); |
michael@0 | 3464 | RECON_AND_STORE(dest, in15); |
michael@0 | 3465 | RECON_AND_STORE(dest, in16); |
michael@0 | 3466 | RECON_AND_STORE(dest, in17); |
michael@0 | 3467 | RECON_AND_STORE(dest, in18); |
michael@0 | 3468 | RECON_AND_STORE(dest, in19); |
michael@0 | 3469 | RECON_AND_STORE(dest, in20); |
michael@0 | 3470 | RECON_AND_STORE(dest, in21); |
michael@0 | 3471 | RECON_AND_STORE(dest, in22); |
michael@0 | 3472 | RECON_AND_STORE(dest, in23); |
michael@0 | 3473 | RECON_AND_STORE(dest, in24); |
michael@0 | 3474 | RECON_AND_STORE(dest, in25); |
michael@0 | 3475 | RECON_AND_STORE(dest, in26); |
michael@0 | 3476 | RECON_AND_STORE(dest, in27); |
michael@0 | 3477 | RECON_AND_STORE(dest, in28); |
michael@0 | 3478 | RECON_AND_STORE(dest, in29); |
michael@0 | 3479 | RECON_AND_STORE(dest, in30); |
michael@0 | 3480 | RECON_AND_STORE(dest, in31); |
michael@0 | 3481 | |
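      // Done with this 8-pixel-wide column strip: step back up 32 rows and
      // right by 8 pixels to the top of the next strip.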
michael@0 | 3482 | dest += 8 - (stride * 32); |
michael@0 | 3483 | } |
michael@0 | 3484 | } |
michael@0 | 3485 | } |
michael@0 | 3486 | |
michael@0 | 3487 | void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
michael@0 | 3488 | int stride) { |
michael@0 | 3489 | const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
michael@0 | 3490 | const __m128i final_rounding = _mm_set1_epi16(1<<5); |
michael@0 | 3491 | |
michael@0 | 3492 | // idct constants for each stage |
michael@0 | 3493 | const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
michael@0 | 3494 | const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
michael@0 | 3495 | const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
michael@0 | 3496 | const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
michael@0 | 3497 | const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
michael@0 | 3498 | const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
michael@0 | 3499 | const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
michael@0 | 3500 | const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
michael@0 | 3501 | const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
michael@0 | 3502 | const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); |
michael@0 | 3503 | const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); |
michael@0 | 3504 | const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); |
michael@0 | 3505 | const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
michael@0 | 3506 | const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); |
michael@0 | 3507 | const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
michael@0 | 3508 | const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); |
michael@0 | 3509 | |
michael@0 | 3510 | const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
michael@0 | 3511 | const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
michael@0 | 3512 | const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
michael@0 | 3513 | const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
michael@0 | 3514 | const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
michael@0 | 3515 | const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
michael@0 | 3516 | const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
michael@0 | 3517 | const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
michael@0 | 3518 | |
michael@0 | 3519 | const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
michael@0 | 3520 | const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
michael@0 | 3521 | const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
michael@0 | 3522 | const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
michael@0 | 3523 | const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
michael@0 | 3524 | const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); |
michael@0 | 3525 | const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
michael@0 | 3526 | const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
michael@0 | 3527 | const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); |
michael@0 | 3528 | const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
michael@0 | 3529 | |
michael@0 | 3530 | const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
michael@0 | 3531 | const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
michael@0 | 3532 | const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
michael@0 | 3533 | const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
michael@0 | 3534 | const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
michael@0 | 3535 | const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
michael@0 | 3536 | const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
michael@0 | 3537 | |
michael@0 | 3538 | const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
michael@0 | 3539 | |
michael@0 | 3540 | __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, |
michael@0 | 3541 | in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, |
michael@0 | 3542 | in24, in25, in26, in27, in28, in29, in30, in31; |
michael@0 | 3543 | __m128i col[128]; |
michael@0 | 3544 | __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
michael@0 | 3545 | stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
michael@0 | 3546 | stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
michael@0 | 3547 | stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
michael@0 | 3548 | stp1_30, stp1_31; |
michael@0 | 3549 | __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
michael@0 | 3550 | stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
michael@0 | 3551 | stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
michael@0 | 3552 | stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
michael@0 | 3553 | stp2_30, stp2_31; |
michael@0 | 3554 | __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
michael@0 | 3555 | int i, j, i32; |
michael@0 | 3556 | __m128i zero_idx[16]; |
michael@0 | 3557 | int zero_flag[2]; |
michael@0 | 3558 | |
michael@0 | 3559 |   // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. |
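  // Pass structure matches the 34-coefficient variant above, except that
  // all four 8x32 input strips are loaded, with a per-strip shortcut: a
  // strip whose coefficients are all zero has its first-pass output zeroed
  // directly.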
michael@0 | 3560 | for (i = 0; i < 8; i++) { |
michael@0 | 3561 | i32 = (i << 5); |
michael@0 | 3562 | if (i < 4) { |
michael@0 | 3563 | // First 1-D idct |
michael@0 | 3564 | // Load input data. |
michael@0 | 3565 | LOAD_DQCOEFF(in0, input); |
michael@0 | 3566 | LOAD_DQCOEFF(in8, input); |
michael@0 | 3567 | LOAD_DQCOEFF(in16, input); |
michael@0 | 3568 | LOAD_DQCOEFF(in24, input); |
michael@0 | 3569 | LOAD_DQCOEFF(in1, input); |
michael@0 | 3570 | LOAD_DQCOEFF(in9, input); |
michael@0 | 3571 | LOAD_DQCOEFF(in17, input); |
michael@0 | 3572 | LOAD_DQCOEFF(in25, input); |
michael@0 | 3573 | LOAD_DQCOEFF(in2, input); |
michael@0 | 3574 | LOAD_DQCOEFF(in10, input); |
michael@0 | 3575 | LOAD_DQCOEFF(in18, input); |
michael@0 | 3576 | LOAD_DQCOEFF(in26, input); |
michael@0 | 3577 | LOAD_DQCOEFF(in3, input); |
michael@0 | 3578 | LOAD_DQCOEFF(in11, input); |
michael@0 | 3579 | LOAD_DQCOEFF(in19, input); |
michael@0 | 3580 | LOAD_DQCOEFF(in27, input); |
michael@0 | 3581 | |
michael@0 | 3582 | LOAD_DQCOEFF(in4, input); |
michael@0 | 3583 | LOAD_DQCOEFF(in12, input); |
michael@0 | 3584 | LOAD_DQCOEFF(in20, input); |
michael@0 | 3585 | LOAD_DQCOEFF(in28, input); |
michael@0 | 3586 | LOAD_DQCOEFF(in5, input); |
michael@0 | 3587 | LOAD_DQCOEFF(in13, input); |
michael@0 | 3588 | LOAD_DQCOEFF(in21, input); |
michael@0 | 3589 | LOAD_DQCOEFF(in29, input); |
michael@0 | 3590 | LOAD_DQCOEFF(in6, input); |
michael@0 | 3591 | LOAD_DQCOEFF(in14, input); |
michael@0 | 3592 | LOAD_DQCOEFF(in22, input); |
michael@0 | 3593 | LOAD_DQCOEFF(in30, input); |
michael@0 | 3594 | LOAD_DQCOEFF(in7, input); |
michael@0 | 3595 | LOAD_DQCOEFF(in15, input); |
michael@0 | 3596 | LOAD_DQCOEFF(in23, input); |
michael@0 | 3597 | LOAD_DQCOEFF(in31, input); |
michael@0 | 3598 | |
michael@0 | 3599 |       // Check whether all entries are zero |
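      // The pairwise ORs below reduce all 32 input vectors into
      // zero_idx[14]; its two 64-bit halves are then OR-ed together and
      // split into two 32-bit words, so zero_flag[0] and zero_flag[1] are
      // both zero iff every coefficient in this 8x32 strip is zero.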
michael@0 | 3600 | zero_idx[0] = _mm_or_si128(in0, in1); |
michael@0 | 3601 | zero_idx[1] = _mm_or_si128(in2, in3); |
michael@0 | 3602 | zero_idx[2] = _mm_or_si128(in4, in5); |
michael@0 | 3603 | zero_idx[3] = _mm_or_si128(in6, in7); |
michael@0 | 3604 | zero_idx[4] = _mm_or_si128(in8, in9); |
michael@0 | 3605 | zero_idx[5] = _mm_or_si128(in10, in11); |
michael@0 | 3606 | zero_idx[6] = _mm_or_si128(in12, in13); |
michael@0 | 3607 | zero_idx[7] = _mm_or_si128(in14, in15); |
michael@0 | 3608 | zero_idx[8] = _mm_or_si128(in16, in17); |
michael@0 | 3609 | zero_idx[9] = _mm_or_si128(in18, in19); |
michael@0 | 3610 | zero_idx[10] = _mm_or_si128(in20, in21); |
michael@0 | 3611 | zero_idx[11] = _mm_or_si128(in22, in23); |
michael@0 | 3612 | zero_idx[12] = _mm_or_si128(in24, in25); |
michael@0 | 3613 | zero_idx[13] = _mm_or_si128(in26, in27); |
michael@0 | 3614 | zero_idx[14] = _mm_or_si128(in28, in29); |
michael@0 | 3615 | zero_idx[15] = _mm_or_si128(in30, in31); |
michael@0 | 3616 | |
michael@0 | 3617 | zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
michael@0 | 3618 | zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
michael@0 | 3619 | zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
michael@0 | 3620 | zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
michael@0 | 3621 | zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
michael@0 | 3622 | zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
michael@0 | 3623 | zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
michael@0 | 3624 | zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
michael@0 | 3625 | |
michael@0 | 3626 | zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
michael@0 | 3627 | zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
michael@0 | 3628 | zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
michael@0 | 3629 | zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
michael@0 | 3630 | zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
michael@0 | 3631 | zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
michael@0 | 3632 | zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
michael@0 | 3633 | |
michael@0 | 3634 | zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); |
michael@0 | 3635 | zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); |
michael@0 | 3636 | zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); |
michael@0 | 3637 | zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); |
michael@0 | 3638 | zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); |
michael@0 | 3639 | |
michael@0 | 3640 | if (!zero_flag[0] && !zero_flag[1]) { |
michael@0 | 3641 | col[i32 + 0] = _mm_setzero_si128(); |
michael@0 | 3642 | col[i32 + 1] = _mm_setzero_si128(); |
michael@0 | 3643 | col[i32 + 2] = _mm_setzero_si128(); |
michael@0 | 3644 | col[i32 + 3] = _mm_setzero_si128(); |
michael@0 | 3645 | col[i32 + 4] = _mm_setzero_si128(); |
michael@0 | 3646 | col[i32 + 5] = _mm_setzero_si128(); |
michael@0 | 3647 | col[i32 + 6] = _mm_setzero_si128(); |
michael@0 | 3648 | col[i32 + 7] = _mm_setzero_si128(); |
michael@0 | 3649 | col[i32 + 8] = _mm_setzero_si128(); |
michael@0 | 3650 | col[i32 + 9] = _mm_setzero_si128(); |
michael@0 | 3651 | col[i32 + 10] = _mm_setzero_si128(); |
michael@0 | 3652 | col[i32 + 11] = _mm_setzero_si128(); |
michael@0 | 3653 | col[i32 + 12] = _mm_setzero_si128(); |
michael@0 | 3654 | col[i32 + 13] = _mm_setzero_si128(); |
michael@0 | 3655 | col[i32 + 14] = _mm_setzero_si128(); |
michael@0 | 3656 | col[i32 + 15] = _mm_setzero_si128(); |
michael@0 | 3657 | col[i32 + 16] = _mm_setzero_si128(); |
michael@0 | 3658 | col[i32 + 17] = _mm_setzero_si128(); |
michael@0 | 3659 | col[i32 + 18] = _mm_setzero_si128(); |
michael@0 | 3660 | col[i32 + 19] = _mm_setzero_si128(); |
michael@0 | 3661 | col[i32 + 20] = _mm_setzero_si128(); |
michael@0 | 3662 | col[i32 + 21] = _mm_setzero_si128(); |
michael@0 | 3663 | col[i32 + 22] = _mm_setzero_si128(); |
michael@0 | 3664 | col[i32 + 23] = _mm_setzero_si128(); |
michael@0 | 3665 | col[i32 + 24] = _mm_setzero_si128(); |
michael@0 | 3666 | col[i32 + 25] = _mm_setzero_si128(); |
michael@0 | 3667 | col[i32 + 26] = _mm_setzero_si128(); |
michael@0 | 3668 | col[i32 + 27] = _mm_setzero_si128(); |
michael@0 | 3669 | col[i32 + 28] = _mm_setzero_si128(); |
michael@0 | 3670 | col[i32 + 29] = _mm_setzero_si128(); |
michael@0 | 3671 | col[i32 + 30] = _mm_setzero_si128(); |
michael@0 | 3672 | col[i32 + 31] = _mm_setzero_si128(); |
michael@0 | 3673 | continue; |
michael@0 | 3674 | } |
michael@0 | 3675 | |
michael@0 | 3676 |       // Transpose the 32x8 block to an 8x32 block |
michael@0 | 3677 | TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
michael@0 | 3678 | in4, in5, in6, in7); |
michael@0 | 3679 | TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
michael@0 | 3680 | in10, in11, in12, in13, in14, in15); |
michael@0 | 3681 | TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, |
michael@0 | 3682 | in18, in19, in20, in21, in22, in23); |
michael@0 | 3683 | TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, |
michael@0 | 3684 | in26, in27, in28, in29, in30, in31); |
michael@0 | 3685 | } else { |
michael@0 | 3686 | // Second 1-D idct |
michael@0 | 3687 | j = i - 4; |
michael@0 | 3688 | |
michael@0 | 3689 |       // Transpose the 32x8 block to an 8x32 block |
michael@0 | 3690 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3691 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3692 | col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, |
michael@0 | 3693 | in5, in6, in7); |
michael@0 | 3694 | j += 4; |
michael@0 | 3695 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3696 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3697 | col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, |
michael@0 | 3698 | in11, in12, in13, in14, in15); |
michael@0 | 3699 | j += 4; |
michael@0 | 3700 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3701 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3702 | col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, |
michael@0 | 3703 | in19, in20, in21, in22, in23); |
michael@0 | 3704 | j += 4; |
michael@0 | 3705 | TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
michael@0 | 3706 | col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
michael@0 | 3707 | col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, |
michael@0 | 3708 | in28, in29, in30, in31); |
michael@0 | 3709 | } |
michael@0 | 3710 | |
michael@0 | 3711 | IDCT32_1D |
michael@0 | 3712 | |
michael@0 | 3713 | // final stage |
michael@0 | 3714 | if (i < 4) { |
michael@0 | 3715 |       // 1-D: Store the 32 intermediate results for each 8x32 block. |
michael@0 | 3716 | col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
michael@0 | 3717 | col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
michael@0 | 3718 | col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
michael@0 | 3719 | col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
michael@0 | 3720 | col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
michael@0 | 3721 | col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
michael@0 | 3722 | col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
michael@0 | 3723 | col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
michael@0 | 3724 | col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
michael@0 | 3725 | col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
michael@0 | 3726 | col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
michael@0 | 3727 | col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
michael@0 | 3728 | col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
michael@0 | 3729 | col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
michael@0 | 3730 | col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
michael@0 | 3731 | col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
michael@0 | 3732 | col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
michael@0 | 3733 | col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
michael@0 | 3734 | col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
michael@0 | 3735 | col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
michael@0 | 3736 | col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
michael@0 | 3737 | col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
michael@0 | 3738 | col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
michael@0 | 3739 | col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
michael@0 | 3740 | col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
michael@0 | 3741 | col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
michael@0 | 3742 | col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
michael@0 | 3743 | col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
michael@0 | 3744 | col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
michael@0 | 3745 | col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
michael@0 | 3746 | col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
michael@0 | 3747 | col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
michael@0 | 3748 | } else { |
michael@0 | 3749 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 3750 | |
michael@0 | 3751 |       // 2-D: Calculate the final results and store them to the destination. |
michael@0 | 3752 | in0 = _mm_add_epi16(stp1_0, stp1_31); |
michael@0 | 3753 | in1 = _mm_add_epi16(stp1_1, stp1_30); |
michael@0 | 3754 | in2 = _mm_add_epi16(stp1_2, stp1_29); |
michael@0 | 3755 | in3 = _mm_add_epi16(stp1_3, stp1_28); |
michael@0 | 3756 | in4 = _mm_add_epi16(stp1_4, stp1_27); |
michael@0 | 3757 | in5 = _mm_add_epi16(stp1_5, stp1_26); |
michael@0 | 3758 | in6 = _mm_add_epi16(stp1_6, stp1_25); |
michael@0 | 3759 | in7 = _mm_add_epi16(stp1_7, stp1_24); |
michael@0 | 3760 | in8 = _mm_add_epi16(stp1_8, stp1_23); |
michael@0 | 3761 | in9 = _mm_add_epi16(stp1_9, stp1_22); |
michael@0 | 3762 | in10 = _mm_add_epi16(stp1_10, stp1_21); |
michael@0 | 3763 | in11 = _mm_add_epi16(stp1_11, stp1_20); |
michael@0 | 3764 | in12 = _mm_add_epi16(stp1_12, stp1_19); |
michael@0 | 3765 | in13 = _mm_add_epi16(stp1_13, stp1_18); |
michael@0 | 3766 | in14 = _mm_add_epi16(stp1_14, stp1_17); |
michael@0 | 3767 | in15 = _mm_add_epi16(stp1_15, stp1_16); |
michael@0 | 3768 | in16 = _mm_sub_epi16(stp1_15, stp1_16); |
michael@0 | 3769 | in17 = _mm_sub_epi16(stp1_14, stp1_17); |
michael@0 | 3770 | in18 = _mm_sub_epi16(stp1_13, stp1_18); |
michael@0 | 3771 | in19 = _mm_sub_epi16(stp1_12, stp1_19); |
michael@0 | 3772 | in20 = _mm_sub_epi16(stp1_11, stp1_20); |
michael@0 | 3773 | in21 = _mm_sub_epi16(stp1_10, stp1_21); |
michael@0 | 3774 | in22 = _mm_sub_epi16(stp1_9, stp1_22); |
michael@0 | 3775 | in23 = _mm_sub_epi16(stp1_8, stp1_23); |
michael@0 | 3776 | in24 = _mm_sub_epi16(stp1_7, stp1_24); |
michael@0 | 3777 | in25 = _mm_sub_epi16(stp1_6, stp1_25); |
michael@0 | 3778 | in26 = _mm_sub_epi16(stp1_5, stp1_26); |
michael@0 | 3779 | in27 = _mm_sub_epi16(stp1_4, stp1_27); |
michael@0 | 3780 | in28 = _mm_sub_epi16(stp1_3, stp1_28); |
michael@0 | 3781 | in29 = _mm_sub_epi16(stp1_2, stp1_29); |
michael@0 | 3782 | in30 = _mm_sub_epi16(stp1_1, stp1_30); |
michael@0 | 3783 | in31 = _mm_sub_epi16(stp1_0, stp1_31); |
michael@0 | 3784 | |
michael@0 | 3785 | // Final rounding and shift |
michael@0 | 3786 | in0 = _mm_adds_epi16(in0, final_rounding); |
michael@0 | 3787 | in1 = _mm_adds_epi16(in1, final_rounding); |
michael@0 | 3788 | in2 = _mm_adds_epi16(in2, final_rounding); |
michael@0 | 3789 | in3 = _mm_adds_epi16(in3, final_rounding); |
michael@0 | 3790 | in4 = _mm_adds_epi16(in4, final_rounding); |
michael@0 | 3791 | in5 = _mm_adds_epi16(in5, final_rounding); |
michael@0 | 3792 | in6 = _mm_adds_epi16(in6, final_rounding); |
michael@0 | 3793 | in7 = _mm_adds_epi16(in7, final_rounding); |
michael@0 | 3794 | in8 = _mm_adds_epi16(in8, final_rounding); |
michael@0 | 3795 | in9 = _mm_adds_epi16(in9, final_rounding); |
michael@0 | 3796 | in10 = _mm_adds_epi16(in10, final_rounding); |
michael@0 | 3797 | in11 = _mm_adds_epi16(in11, final_rounding); |
michael@0 | 3798 | in12 = _mm_adds_epi16(in12, final_rounding); |
michael@0 | 3799 | in13 = _mm_adds_epi16(in13, final_rounding); |
michael@0 | 3800 | in14 = _mm_adds_epi16(in14, final_rounding); |
michael@0 | 3801 | in15 = _mm_adds_epi16(in15, final_rounding); |
michael@0 | 3802 | in16 = _mm_adds_epi16(in16, final_rounding); |
michael@0 | 3803 | in17 = _mm_adds_epi16(in17, final_rounding); |
michael@0 | 3804 | in18 = _mm_adds_epi16(in18, final_rounding); |
michael@0 | 3805 | in19 = _mm_adds_epi16(in19, final_rounding); |
michael@0 | 3806 | in20 = _mm_adds_epi16(in20, final_rounding); |
michael@0 | 3807 | in21 = _mm_adds_epi16(in21, final_rounding); |
michael@0 | 3808 | in22 = _mm_adds_epi16(in22, final_rounding); |
michael@0 | 3809 | in23 = _mm_adds_epi16(in23, final_rounding); |
michael@0 | 3810 | in24 = _mm_adds_epi16(in24, final_rounding); |
michael@0 | 3811 | in25 = _mm_adds_epi16(in25, final_rounding); |
michael@0 | 3812 | in26 = _mm_adds_epi16(in26, final_rounding); |
michael@0 | 3813 | in27 = _mm_adds_epi16(in27, final_rounding); |
michael@0 | 3814 | in28 = _mm_adds_epi16(in28, final_rounding); |
michael@0 | 3815 | in29 = _mm_adds_epi16(in29, final_rounding); |
michael@0 | 3816 | in30 = _mm_adds_epi16(in30, final_rounding); |
michael@0 | 3817 | in31 = _mm_adds_epi16(in31, final_rounding); |
michael@0 | 3818 | |
michael@0 | 3819 | in0 = _mm_srai_epi16(in0, 6); |
michael@0 | 3820 | in1 = _mm_srai_epi16(in1, 6); |
michael@0 | 3821 | in2 = _mm_srai_epi16(in2, 6); |
michael@0 | 3822 | in3 = _mm_srai_epi16(in3, 6); |
michael@0 | 3823 | in4 = _mm_srai_epi16(in4, 6); |
michael@0 | 3824 | in5 = _mm_srai_epi16(in5, 6); |
michael@0 | 3825 | in6 = _mm_srai_epi16(in6, 6); |
michael@0 | 3826 | in7 = _mm_srai_epi16(in7, 6); |
michael@0 | 3827 | in8 = _mm_srai_epi16(in8, 6); |
michael@0 | 3828 | in9 = _mm_srai_epi16(in9, 6); |
michael@0 | 3829 | in10 = _mm_srai_epi16(in10, 6); |
michael@0 | 3830 | in11 = _mm_srai_epi16(in11, 6); |
michael@0 | 3831 | in12 = _mm_srai_epi16(in12, 6); |
michael@0 | 3832 | in13 = _mm_srai_epi16(in13, 6); |
michael@0 | 3833 | in14 = _mm_srai_epi16(in14, 6); |
michael@0 | 3834 | in15 = _mm_srai_epi16(in15, 6); |
michael@0 | 3835 | in16 = _mm_srai_epi16(in16, 6); |
michael@0 | 3836 | in17 = _mm_srai_epi16(in17, 6); |
michael@0 | 3837 | in18 = _mm_srai_epi16(in18, 6); |
michael@0 | 3838 | in19 = _mm_srai_epi16(in19, 6); |
michael@0 | 3839 | in20 = _mm_srai_epi16(in20, 6); |
michael@0 | 3840 | in21 = _mm_srai_epi16(in21, 6); |
michael@0 | 3841 | in22 = _mm_srai_epi16(in22, 6); |
michael@0 | 3842 | in23 = _mm_srai_epi16(in23, 6); |
michael@0 | 3843 | in24 = _mm_srai_epi16(in24, 6); |
michael@0 | 3844 | in25 = _mm_srai_epi16(in25, 6); |
michael@0 | 3845 | in26 = _mm_srai_epi16(in26, 6); |
michael@0 | 3846 | in27 = _mm_srai_epi16(in27, 6); |
michael@0 | 3847 | in28 = _mm_srai_epi16(in28, 6); |
michael@0 | 3848 | in29 = _mm_srai_epi16(in29, 6); |
michael@0 | 3849 | in30 = _mm_srai_epi16(in30, 6); |
michael@0 | 3850 | in31 = _mm_srai_epi16(in31, 6); |
michael@0 | 3851 | |
michael@0 | 3852 | RECON_AND_STORE(dest, in0); |
michael@0 | 3853 | RECON_AND_STORE(dest, in1); |
michael@0 | 3854 | RECON_AND_STORE(dest, in2); |
michael@0 | 3855 | RECON_AND_STORE(dest, in3); |
michael@0 | 3856 | RECON_AND_STORE(dest, in4); |
michael@0 | 3857 | RECON_AND_STORE(dest, in5); |
michael@0 | 3858 | RECON_AND_STORE(dest, in6); |
michael@0 | 3859 | RECON_AND_STORE(dest, in7); |
michael@0 | 3860 | RECON_AND_STORE(dest, in8); |
michael@0 | 3861 | RECON_AND_STORE(dest, in9); |
michael@0 | 3862 | RECON_AND_STORE(dest, in10); |
michael@0 | 3863 | RECON_AND_STORE(dest, in11); |
michael@0 | 3864 | RECON_AND_STORE(dest, in12); |
michael@0 | 3865 | RECON_AND_STORE(dest, in13); |
michael@0 | 3866 | RECON_AND_STORE(dest, in14); |
michael@0 | 3867 | RECON_AND_STORE(dest, in15); |
michael@0 | 3868 | RECON_AND_STORE(dest, in16); |
michael@0 | 3869 | RECON_AND_STORE(dest, in17); |
michael@0 | 3870 | RECON_AND_STORE(dest, in18); |
michael@0 | 3871 | RECON_AND_STORE(dest, in19); |
michael@0 | 3872 | RECON_AND_STORE(dest, in20); |
michael@0 | 3873 | RECON_AND_STORE(dest, in21); |
michael@0 | 3874 | RECON_AND_STORE(dest, in22); |
michael@0 | 3875 | RECON_AND_STORE(dest, in23); |
michael@0 | 3876 | RECON_AND_STORE(dest, in24); |
michael@0 | 3877 | RECON_AND_STORE(dest, in25); |
michael@0 | 3878 | RECON_AND_STORE(dest, in26); |
michael@0 | 3879 | RECON_AND_STORE(dest, in27); |
michael@0 | 3880 | RECON_AND_STORE(dest, in28); |
michael@0 | 3881 | RECON_AND_STORE(dest, in29); |
michael@0 | 3882 | RECON_AND_STORE(dest, in30); |
michael@0 | 3883 | RECON_AND_STORE(dest, in31); |
michael@0 | 3884 | |
michael@0 | 3885 | dest += 8 - (stride * 32); |
michael@0 | 3886 | } |
michael@0 | 3887 | } |
michael@0 | 3888 | } //NOLINT |
michael@0 | 3889 | |
michael@0 | 3890 | void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { |
michael@0 | 3891 | __m128i dc_value; |
michael@0 | 3892 | const __m128i zero = _mm_setzero_si128(); |
michael@0 | 3893 | int a, i; |
michael@0 | 3894 | |
michael@0 | 3895 | a = dct_const_round_shift(input[0] * cospi_16_64); |
michael@0 | 3896 | a = dct_const_round_shift(a * cospi_16_64); |
michael@0 | 3897 | a = ROUND_POWER_OF_TWO(a, 6); |
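  // cospi_16_64 is cos(pi/4) in Q14, so the two dct_const_round_shift steps
  // scale the DC coefficient by cos(pi/4)^2 = 1/2 (one factor per 1-D pass),
  // and ROUND_POWER_OF_TWO(a, 6) applies the transform's final 1/64 scaling;
  // 'a' is then the constant added to every one of the 32x32 output pixels.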
michael@0 | 3898 | |
michael@0 | 3899 | dc_value = _mm_set1_epi16(a); |
michael@0 | 3900 | |
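  // Each iteration reconstructs one 8-pixel-wide, 32-row column strip
  // (32 RECON_AND_STORE rows); four iterations cover the whole 32x32 block.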
michael@0 | 3901 | for (i = 0; i < 4; ++i) { |
michael@0 | 3902 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3903 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3904 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3905 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3906 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3907 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3908 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3909 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3910 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3911 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3912 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3913 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3914 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3915 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3916 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3917 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3918 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3919 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3920 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3921 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3922 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3923 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3924 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3925 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3926 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3927 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3928 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3929 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3930 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3931 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3932 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3933 | RECON_AND_STORE(dest, dc_value); |
michael@0 | 3934 | dest += 8 - (stride * 32); |
michael@0 | 3935 | } |
michael@0 | 3936 | } |