media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

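// RECON_AND_STORE4X4 reconstructs one 4-pixel row: it loads four prediction
// bytes from dest, widens them to 16 bits, adds the 16-bit residual in in_x,
// saturates back to unsigned 8-bit, stores the four bytes, and advances dest
// by one stride. It relies on `zero` and `stride` from the enclosing scope.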
#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

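  // The 2-D inverse is computed as a row pass followed by a column pass.
  // In each pass, coefficient pairs are interleaved so that _mm_madd_epi16
  // against the packed cosine constants in cst computes the butterfly
  // rotations of the 4-point IDCT in 32-bit precision; the results are then
  // rounded and shifted down by DCT_CONST_BITS.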
  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3; then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Swap columns 2 and 3; then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

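  // Note the store order below: the rows for dest + stride * 2 and
  // dest + stride * 3 are swapped, undoing the column swap performed above
  // with _mm_shuffle_epi32(input1, 0x4e).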
  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

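  // With only the DC coefficient nonzero, both 1-D passes collapse to a
  // scalar multiply by cospi_16_64 with DCT rounding (dct_const_round_shift
  // is ROUND_POWER_OF_TWO(x, DCT_CONST_BITS)), followed by the same final
  // ROUND_POWER_OF_TWO(., 4) output shift used by the full transform.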
  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

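// idct4_1d_sse2 applies one 1-D 4-point IDCT to the rows held in in[0..3]
// (each row occupying the low 64 bits of a register): transpose first, then
// two butterfly rotations built from _mm_unpacklo_epi16 + _mm_madd_epi16 in
// stage 1, and the add/sub butterflies of stage 2.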
static void idct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[2]);
  u[1] = _mm_packs_epi32(v[1], v[3]);
  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
  u[3] = _mm_unpackhi_epi64(u[1], u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[3]);
  in[1] = _mm_add_epi16(u[1], u[2]);
  in[2] = _mm_sub_epi16(u[1], u[2]);
  in[3] = _mm_sub_epi16(u[0], u[3]);
}

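// iadst4_1d_sse2 is the 4-point inverse ADST used by the hybrid transforms.
// It rotates with the sinpi_*_9 constants instead of cospi_*; in7 holds the
// combined term in[0] - in[2] + in[3], whose product with sinpi_3_9 becomes
// the third output (the x2 term noted below).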
static void iadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_add_epi16(in[0], in[3]);
  in7 = _mm_sub_epi16(in7, in[2]);

  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[1], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}

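// vp9_iht4x4_16_add_sse2 implements the 4x4 inverse hybrid transform: each
// 1-D pass is either an IDCT or an IADST, selected per dimension by tx_type.
// Both 1-D helpers transpose their input first, so calling two of them
// back-to-back covers rows and then columns.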
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}

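// TRANSPOSE_8X8 transposes eight rows of eight 16-bit values with three
// rounds of interleaves: 16-bit unpacks build 2x2 blocks, 32-bit unpacks
// build 4x4 blocks, and 64-bit unpacks assemble the transposed rows.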
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */ \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */ \
  }

// Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

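// IDCT8_1D runs the four stages of the 8-point IDCT on in0..in7 in place:
// stage 1 rotates the odd inputs (1,7) and (3,5), stage 2 rotates the even
// inputs (0,4) and (2,6) and butterflies the stage-1 results, stage 3 mixes
// the middle pair through cospi_16_64 rotations, and stage 4 forms the
// final add/sub butterflies.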
#define IDCT8_1D \
  /* Stage1 */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4, \
                           stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4 */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

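// 8-wide variant of RECON_AND_STORE4X4: adds one 8-pixel residual row to the
// prediction in dest, saturates to 8-bit, and advances dest by one stride.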
#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }

void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static void idct8_1d_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  in0 = in[0];
  in1 = in[1];
  in2 = in[2];
  in3 = in[3];
  in4 = in[4];
  in5 = in[5];
  in6 = in[6];
  in7 = in[7];

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8_1D
  in[0] = in0;
  in[1] = in1;
  in[2] = in2;
  in[3] = in3;
  in[4] = in4;
  in[5] = in5;
  in[6] = in6;
  in[7] = in7;
}

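// iadst8_1d_sse2 computes the 8-point inverse ADST: after an 8x8 transpose
// and a butterfly-order permutation of the inputs, stage 1 applies the odd
// cospi rotations, stage 2 mixes the halves through (cospi_8, cospi_24)
// rotations, and stage 3 finishes with cospi_16_64 rotations; odd-indexed
// outputs are negated at the end.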
static void iadst8_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

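// vp9_iht8x8_64_add_sse2 is the 8x8 analogue of vp9_iht4x4_16_add_sse2:
// tx_type selects IDCT or IADST per dimension, and each 1-D helper begins
// with its own transpose.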
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

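// vp9_idct8x8_10_add_sse2 is the reduced-work path for blocks whose nonzero
// coefficients are confined to the first four rows (per the libvpx naming,
// at most the first 10 coefficients in scan order): the row pass loads and
// transforms only four input rows through an 8x4 transpose before running
// the full-width column pass.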
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  {  // NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  {  // NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  {  // NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

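// IDCT16_1D expands to stages 2 through 6 of the 16-point IDCT butterfly
// network; the trivial stage-1 reordering and the final output butterflies
// are left to the code that instantiates the macro.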
michael@0 1109 #define IDCT16_1D \
michael@0 1110 /* Stage2 */ \
michael@0 1111 { \
michael@0 1112 const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
michael@0 1113 const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
michael@0 1114 const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
michael@0 1115 const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
michael@0 1116 const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
michael@0 1117 const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
michael@0 1118 const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
michael@0 1119 const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
michael@0 1120 \
michael@0 1121 MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
michael@0 1122 stg2_0, stg2_1, stg2_2, stg2_3, \
michael@0 1123 stp2_8, stp2_15, stp2_9, stp2_14) \
michael@0 1124 \
michael@0 1125 MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
michael@0 1126 stg2_4, stg2_5, stg2_6, stg2_7, \
michael@0 1127 stp2_10, stp2_13, stp2_11, stp2_12) \
michael@0 1128 } \
michael@0 1129 \
michael@0 1130 /* Stage3 */ \
michael@0 1131 { \
michael@0 1132 const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
michael@0 1133 const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
michael@0 1134 const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
michael@0 1135 const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
michael@0 1136 \
michael@0 1137 MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
michael@0 1138 stg3_0, stg3_1, stg3_2, stg3_3, \
michael@0 1139 stp1_4, stp1_7, stp1_5, stp1_6) \
michael@0 1140 \
michael@0 1141 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
michael@0 1142 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
michael@0 1143 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
michael@0 1144 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
michael@0 1145 \
michael@0 1146 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
michael@0 1147 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
michael@0 1148 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
michael@0 1149 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
michael@0 1150 } \
michael@0 1151 \
michael@0 1152 /* Stage4 */ \
michael@0 1153 { \
michael@0 1154 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
michael@0 1155 const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
michael@0 1156 const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
michael@0 1157 const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
michael@0 1158 \
michael@0 1159 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
michael@0 1160 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
michael@0 1161 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
michael@0 1162 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
michael@0 1163 \
michael@0 1164 MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
michael@0 1165 stg4_0, stg4_1, stg4_2, stg4_3, \
michael@0 1166 stp2_0, stp2_1, stp2_2, stp2_3) \
michael@0 1167 \
michael@0 1168 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
michael@0 1169 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
michael@0 1170 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
michael@0 1171 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
michael@0 1172 \
michael@0 1173 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
michael@0 1174 stg4_4, stg4_5, stg4_6, stg4_7, \
michael@0 1175 stp2_9, stp2_14, stp2_10, stp2_13) \
michael@0 1176 } \
michael@0 1177 \
michael@0 1178 /* Stage5 */ \
michael@0 1179 { \
michael@0 1180 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
michael@0 1181 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
michael@0 1182 \
michael@0 1183 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
michael@0 1184 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
michael@0 1185 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
michael@0 1186 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
michael@0 1187 \
michael@0 1188 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
michael@0 1189 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
michael@0 1190 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
michael@0 1191 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
michael@0 1192 \
michael@0 1193 tmp0 = _mm_add_epi32(tmp0, rounding); \
michael@0 1194 tmp1 = _mm_add_epi32(tmp1, rounding); \
michael@0 1195 tmp2 = _mm_add_epi32(tmp2, rounding); \
michael@0 1196 tmp3 = _mm_add_epi32(tmp3, rounding); \
michael@0 1197 \
michael@0 1198 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
michael@0 1199 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
michael@0 1200 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
michael@0 1201 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
michael@0 1202 \
michael@0 1203 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
michael@0 1204 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
michael@0 1205 \
michael@0 1206 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
michael@0 1207 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
michael@0 1208 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
michael@0 1209 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
michael@0 1210 \
michael@0 1211 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
michael@0 1212 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
michael@0 1213 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
michael@0 1214 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
michael@0 1215 } \
michael@0 1216 \
michael@0 1217 /* Stage6 */ \
michael@0 1218 { \
michael@0 1219 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
michael@0 1220 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
michael@0 1221 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
michael@0 1222 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
michael@0 1223 \
michael@0 1224 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
michael@0 1225 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
michael@0 1226 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
michael@0 1227 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
michael@0 1228 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
michael@0 1229 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
michael@0 1230 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
michael@0 1231 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
michael@0 1232 \
michael@0 1233 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
michael@0 1234 stg6_0, stg4_0, stg6_0, stg4_0, \
michael@0 1235 stp2_10, stp2_13, stp2_11, stp2_12) \
michael@0 1236 }
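/*
 * IDCT16_1D evaluates stages 2-6 of the 16-point inverse DCT on the eight
 * columns held in in0..in15. Each MULTIPLICATION_AND_ADD invocation above
 * is the vector form of a pair of scalar butterflies; as a rough scalar
 * sketch (illustrative only, with per-stage cosine constants c0..c3):
 *
 *   out_a = dct_const_round_shift(x * c0 + y * c1);
 *   out_b = dct_const_round_shift(x * c2 + y * c3);
 *
 * The _mm_madd_epi16 form computes x*c_even + y*c_odd in 32-bit lanes on
 * interleaved inputs; adding `rounding` and shifting by DCT_CONST_BITS
 * matches dct_const_round_shift.
 */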
michael@0 1237
michael@0 1238 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
michael@0 1239 int stride) {
michael@0 1240 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 1241 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
michael@0 1242 const __m128i zero = _mm_setzero_si128();
michael@0 1243
michael@0 1244 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
michael@0 1245 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
michael@0 1246 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
michael@0 1247 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
michael@0 1248 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
michael@0 1249 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
michael@0 1250 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
michael@0 1251 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
michael@0 1252
michael@0 1253 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
michael@0 1254 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
michael@0 1255 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
michael@0 1256 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
michael@0 1257
michael@0 1258 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
michael@0 1259 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 1260 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
michael@0 1261 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
michael@0 1262 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 1263 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
michael@0 1264 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
michael@0 1265 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 1266
michael@0 1267 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 1268
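  /*
   * Each stgX_Y constant packs two 16-bit cosines per 32-bit lane (via
   * pair_set_epi16), so a single _mm_madd_epi16 against an unpacked
   * (a, b) input pair yields a*c_lo + b*c_hi in every lane.
   */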
michael@0 1269 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
michael@0 1270 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
michael@0 1271 in10 = zero, in11 = zero, in12 = zero, in13 = zero,
michael@0 1272 in14 = zero, in15 = zero;
michael@0 1273 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
michael@0 1274 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
michael@0 1275 l12 = zero, l13 = zero, l14 = zero, l15 = zero;
michael@0 1276 __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
michael@0 1277 r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
michael@0 1278 r12 = zero, r13 = zero, r14 = zero, r15 = zero;
michael@0 1279 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
michael@0 1280 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
michael@0 1281 stp1_8_0, stp1_12_0;
michael@0 1282 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
michael@0 1283 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
michael@0 1284 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
michael@0 1285 int i;
michael@0 1286
michael@0 1287 // We work on an 8x16 block each time, looping four times for the 2-D 16x16 IDCT.
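  // Passes 0 and 1 transform the rows of the left and right 8x16 input
  // halves into l0..l15 and r0..r15; passes 2 and 3 transpose those
  // results and transform the columns, then round, shift and store.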
michael@0 1288 for (i = 0; i < 4; i++) {
michael@0 1289 // 1-D IDCT
michael@0 1290 if (i < 2) {
michael@0 1291 if (i == 1) input += 128;
michael@0 1292
michael@0 1293 // Load input data.
michael@0 1294 in0 = _mm_load_si128((const __m128i *)input);
michael@0 1295 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
michael@0 1296 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
michael@0 1297 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
michael@0 1298 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
michael@0 1299 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
michael@0 1300 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
michael@0 1301 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
michael@0 1302 in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
michael@0 1303 in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
michael@0 1304 in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
michael@0 1305 in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
michael@0 1306 in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
michael@0 1307 in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
michael@0 1308 in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
michael@0 1309 in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));
michael@0 1310
michael@0 1311 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
michael@0 1312 in4, in5, in6, in7);
michael@0 1313 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
michael@0 1314 in10, in11, in12, in13, in14, in15);
michael@0 1315 }
michael@0 1316
michael@0 1317 if (i == 2) {
michael@0 1318 TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
michael@0 1319 in5, in6, in7);
michael@0 1320 TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
michael@0 1321 in13, in14, in15);
michael@0 1322 }
michael@0 1323
michael@0 1324 if (i == 3) {
michael@0 1325 TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
michael@0 1326 in4, in5, in6, in7);
michael@0 1327 TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
michael@0 1328 in12, in13, in14, in15);
michael@0 1329 }
michael@0 1330
michael@0 1331 IDCT16_1D
michael@0 1332
michael@0 1333 // Stage7
michael@0 1334 if (i == 0) {
michael@0 1335 // Left 8x16
michael@0 1336 l0 = _mm_add_epi16(stp2_0, stp1_15);
michael@0 1337 l1 = _mm_add_epi16(stp2_1, stp1_14);
michael@0 1338 l2 = _mm_add_epi16(stp2_2, stp2_13);
michael@0 1339 l3 = _mm_add_epi16(stp2_3, stp2_12);
michael@0 1340 l4 = _mm_add_epi16(stp2_4, stp2_11);
michael@0 1341 l5 = _mm_add_epi16(stp2_5, stp2_10);
michael@0 1342 l6 = _mm_add_epi16(stp2_6, stp1_9);
michael@0 1343 l7 = _mm_add_epi16(stp2_7, stp1_8);
michael@0 1344 l8 = _mm_sub_epi16(stp2_7, stp1_8);
michael@0 1345 l9 = _mm_sub_epi16(stp2_6, stp1_9);
michael@0 1346 l10 = _mm_sub_epi16(stp2_5, stp2_10);
michael@0 1347 l11 = _mm_sub_epi16(stp2_4, stp2_11);
michael@0 1348 l12 = _mm_sub_epi16(stp2_3, stp2_12);
michael@0 1349 l13 = _mm_sub_epi16(stp2_2, stp2_13);
michael@0 1350 l14 = _mm_sub_epi16(stp2_1, stp1_14);
michael@0 1351 l15 = _mm_sub_epi16(stp2_0, stp1_15);
michael@0 1352 } else if (i == 1) {
michael@0 1353 // Right 8x16
michael@0 1354 r0 = _mm_add_epi16(stp2_0, stp1_15);
michael@0 1355 r1 = _mm_add_epi16(stp2_1, stp1_14);
michael@0 1356 r2 = _mm_add_epi16(stp2_2, stp2_13);
michael@0 1357 r3 = _mm_add_epi16(stp2_3, stp2_12);
michael@0 1358 r4 = _mm_add_epi16(stp2_4, stp2_11);
michael@0 1359 r5 = _mm_add_epi16(stp2_5, stp2_10);
michael@0 1360 r6 = _mm_add_epi16(stp2_6, stp1_9);
michael@0 1361 r7 = _mm_add_epi16(stp2_7, stp1_8);
michael@0 1362 r8 = _mm_sub_epi16(stp2_7, stp1_8);
michael@0 1363 r9 = _mm_sub_epi16(stp2_6, stp1_9);
michael@0 1364 r10 = _mm_sub_epi16(stp2_5, stp2_10);
michael@0 1365 r11 = _mm_sub_epi16(stp2_4, stp2_11);
michael@0 1366 r12 = _mm_sub_epi16(stp2_3, stp2_12);
michael@0 1367 r13 = _mm_sub_epi16(stp2_2, stp2_13);
michael@0 1368 r14 = _mm_sub_epi16(stp2_1, stp1_14);
michael@0 1369 r15 = _mm_sub_epi16(stp2_0, stp1_15);
michael@0 1370 } else {
michael@0 1371 // 2-D result: final column-pass output, reconstructed below
michael@0 1372 in0 = _mm_add_epi16(stp2_0, stp1_15);
michael@0 1373 in1 = _mm_add_epi16(stp2_1, stp1_14);
michael@0 1374 in2 = _mm_add_epi16(stp2_2, stp2_13);
michael@0 1375 in3 = _mm_add_epi16(stp2_3, stp2_12);
michael@0 1376 in4 = _mm_add_epi16(stp2_4, stp2_11);
michael@0 1377 in5 = _mm_add_epi16(stp2_5, stp2_10);
michael@0 1378 in6 = _mm_add_epi16(stp2_6, stp1_9);
michael@0 1379 in7 = _mm_add_epi16(stp2_7, stp1_8);
michael@0 1380 in8 = _mm_sub_epi16(stp2_7, stp1_8);
michael@0 1381 in9 = _mm_sub_epi16(stp2_6, stp1_9);
michael@0 1382 in10 = _mm_sub_epi16(stp2_5, stp2_10);
michael@0 1383 in11 = _mm_sub_epi16(stp2_4, stp2_11);
michael@0 1384 in12 = _mm_sub_epi16(stp2_3, stp2_12);
michael@0 1385 in13 = _mm_sub_epi16(stp2_2, stp2_13);
michael@0 1386 in14 = _mm_sub_epi16(stp2_1, stp1_14);
michael@0 1387 in15 = _mm_sub_epi16(stp2_0, stp1_15);
michael@0 1388
michael@0 1389 // Final rounding and shift
michael@0 1390 in0 = _mm_adds_epi16(in0, final_rounding);
michael@0 1391 in1 = _mm_adds_epi16(in1, final_rounding);
michael@0 1392 in2 = _mm_adds_epi16(in2, final_rounding);
michael@0 1393 in3 = _mm_adds_epi16(in3, final_rounding);
michael@0 1394 in4 = _mm_adds_epi16(in4, final_rounding);
michael@0 1395 in5 = _mm_adds_epi16(in5, final_rounding);
michael@0 1396 in6 = _mm_adds_epi16(in6, final_rounding);
michael@0 1397 in7 = _mm_adds_epi16(in7, final_rounding);
michael@0 1398 in8 = _mm_adds_epi16(in8, final_rounding);
michael@0 1399 in9 = _mm_adds_epi16(in9, final_rounding);
michael@0 1400 in10 = _mm_adds_epi16(in10, final_rounding);
michael@0 1401 in11 = _mm_adds_epi16(in11, final_rounding);
michael@0 1402 in12 = _mm_adds_epi16(in12, final_rounding);
michael@0 1403 in13 = _mm_adds_epi16(in13, final_rounding);
michael@0 1404 in14 = _mm_adds_epi16(in14, final_rounding);
michael@0 1405 in15 = _mm_adds_epi16(in15, final_rounding);
michael@0 1406
michael@0 1407 in0 = _mm_srai_epi16(in0, 6);
michael@0 1408 in1 = _mm_srai_epi16(in1, 6);
michael@0 1409 in2 = _mm_srai_epi16(in2, 6);
michael@0 1410 in3 = _mm_srai_epi16(in3, 6);
michael@0 1411 in4 = _mm_srai_epi16(in4, 6);
michael@0 1412 in5 = _mm_srai_epi16(in5, 6);
michael@0 1413 in6 = _mm_srai_epi16(in6, 6);
michael@0 1414 in7 = _mm_srai_epi16(in7, 6);
michael@0 1415 in8 = _mm_srai_epi16(in8, 6);
michael@0 1416 in9 = _mm_srai_epi16(in9, 6);
michael@0 1417 in10 = _mm_srai_epi16(in10, 6);
michael@0 1418 in11 = _mm_srai_epi16(in11, 6);
michael@0 1419 in12 = _mm_srai_epi16(in12, 6);
michael@0 1420 in13 = _mm_srai_epi16(in13, 6);
michael@0 1421 in14 = _mm_srai_epi16(in14, 6);
michael@0 1422 in15 = _mm_srai_epi16(in15, 6);
michael@0 1423
michael@0 1424 RECON_AND_STORE(dest, in0);
michael@0 1425 RECON_AND_STORE(dest, in1);
michael@0 1426 RECON_AND_STORE(dest, in2);
michael@0 1427 RECON_AND_STORE(dest, in3);
michael@0 1428 RECON_AND_STORE(dest, in4);
michael@0 1429 RECON_AND_STORE(dest, in5);
michael@0 1430 RECON_AND_STORE(dest, in6);
michael@0 1431 RECON_AND_STORE(dest, in7);
michael@0 1432 RECON_AND_STORE(dest, in8);
michael@0 1433 RECON_AND_STORE(dest, in9);
michael@0 1434 RECON_AND_STORE(dest, in10);
michael@0 1435 RECON_AND_STORE(dest, in11);
michael@0 1436 RECON_AND_STORE(dest, in12);
michael@0 1437 RECON_AND_STORE(dest, in13);
michael@0 1438 RECON_AND_STORE(dest, in14);
michael@0 1439 RECON_AND_STORE(dest, in15);
michael@0 1440
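      // The 16 RECON_AND_STORE calls above advanced dest by 16 rows;
      // step back up and across to the next 8-pixel-wide column half.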
michael@0 1441 dest += 8 - (stride * 16);
michael@0 1442 }
michael@0 1443 }
michael@0 1444 }
michael@0 1445
michael@0 1446 void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
michael@0 1447 __m128i dc_value;
michael@0 1448 const __m128i zero = _mm_setzero_si128();
michael@0 1449 int a, i;
michael@0 1450
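  // For a DC-only block both 1-D passes reduce to a multiply by
  // cospi_16_64 with DCT rounding; ROUND_POWER_OF_TWO(a, 6) is the
  // final rounding shift, matching the full transform's >> 6 above.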
michael@0 1451 a = dct_const_round_shift(input[0] * cospi_16_64);
michael@0 1452 a = dct_const_round_shift(a * cospi_16_64);
michael@0 1453 a = ROUND_POWER_OF_TWO(a, 6);
michael@0 1454
michael@0 1455 dc_value = _mm_set1_epi16(a);
michael@0 1456
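  // Each iteration reconstructs one 8-pixel-wide half of the 16x16
  // block; the 16 RECON_AND_STORE calls below cover its 16 rows.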
michael@0 1457 for (i = 0; i < 2; ++i) {
michael@0 1458 RECON_AND_STORE(dest, dc_value);
michael@0 1459 RECON_AND_STORE(dest, dc_value);
michael@0 1460 RECON_AND_STORE(dest, dc_value);
michael@0 1461 RECON_AND_STORE(dest, dc_value);
michael@0 1462 RECON_AND_STORE(dest, dc_value);
michael@0 1463 RECON_AND_STORE(dest, dc_value);
michael@0 1464 RECON_AND_STORE(dest, dc_value);
michael@0 1465 RECON_AND_STORE(dest, dc_value);
michael@0 1466 RECON_AND_STORE(dest, dc_value);
michael@0 1467 RECON_AND_STORE(dest, dc_value);
michael@0 1468 RECON_AND_STORE(dest, dc_value);
michael@0 1469 RECON_AND_STORE(dest, dc_value);
michael@0 1470 RECON_AND_STORE(dest, dc_value);
michael@0 1471 RECON_AND_STORE(dest, dc_value);
michael@0 1472 RECON_AND_STORE(dest, dc_value);
michael@0 1473 RECON_AND_STORE(dest, dc_value);
michael@0 1474 dest += 8 - (stride * 16);
michael@0 1475 }
michael@0 1476 }
michael@0 1477
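// Transpose a 16x16 block stored as two 8x16 halves (res0: left,
// res1: right) using four 8x8 transposes; tbuf stages the block that
// would otherwise be overwritten when the off-diagonal quadrants swap.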
michael@0 1478 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
michael@0 1479 __m128i tbuf[8];
michael@0 1480 array_transpose_8x8(res0, res0);
michael@0 1481 array_transpose_8x8(res1, tbuf);
michael@0 1482 array_transpose_8x8(res0 + 8, res1);
michael@0 1483 array_transpose_8x8(res1 + 8, res1 + 8);
michael@0 1484
michael@0 1485 res0[8] = tbuf[0];
michael@0 1486 res0[9] = tbuf[1];
michael@0 1487 res0[10] = tbuf[2];
michael@0 1488 res0[11] = tbuf[3];
michael@0 1489 res0[12] = tbuf[4];
michael@0 1490 res0[13] = tbuf[5];
michael@0 1491 res0[14] = tbuf[6];
michael@0 1492 res0[15] = tbuf[7];
michael@0 1493 }
michael@0 1494
michael@0 1495 static void iadst16_1d_8col(__m128i *in) {
michael@0 1496 // Perform a 16-point 1-D ADST on 8 columns.
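  // Every stage below follows the same vector pattern: unpack two 16-bit
  // inputs into interleaved lanes, _mm_madd_epi16 against a packed cosine
  // pair, add k__DCT_CONST_ROUNDING, shift right by DCT_CONST_BITS, and
  // pack back to 16 bits. Scalar sketch (illustrative only):
  //   out = dct_const_round_shift(a * c0 + b * c1);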
michael@0 1497 __m128i s[16], x[16], u[32], v[32];
michael@0 1498 const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
michael@0 1499 const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
michael@0 1500 const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
michael@0 1501 const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
michael@0 1502 const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
michael@0 1503 const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
michael@0 1504 const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
michael@0 1505 const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
michael@0 1506 const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
michael@0 1507 const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
michael@0 1508 const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
michael@0 1509 const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
michael@0 1510 const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
michael@0 1511 const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
michael@0 1512 const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
michael@0 1513 const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
michael@0 1514 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
michael@0 1515 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
michael@0 1516 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
michael@0 1517 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
michael@0 1518 const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
michael@0 1519 const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
michael@0 1520 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
michael@0 1521 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
michael@0 1522 const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
michael@0 1523 const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
michael@0 1524 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
michael@0 1525 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 1526 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 1527 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 1528 const __m128i kZero = _mm_set1_epi16(0);
michael@0 1529
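  // stage 1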
michael@0 1530 u[0] = _mm_unpacklo_epi16(in[15], in[0]);
michael@0 1531 u[1] = _mm_unpackhi_epi16(in[15], in[0]);
michael@0 1532 u[2] = _mm_unpacklo_epi16(in[13], in[2]);
michael@0 1533 u[3] = _mm_unpackhi_epi16(in[13], in[2]);
michael@0 1534 u[4] = _mm_unpacklo_epi16(in[11], in[4]);
michael@0 1535 u[5] = _mm_unpackhi_epi16(in[11], in[4]);
michael@0 1536 u[6] = _mm_unpacklo_epi16(in[9], in[6]);
michael@0 1537 u[7] = _mm_unpackhi_epi16(in[9], in[6]);
michael@0 1538 u[8] = _mm_unpacklo_epi16(in[7], in[8]);
michael@0 1539 u[9] = _mm_unpackhi_epi16(in[7], in[8]);
michael@0 1540 u[10] = _mm_unpacklo_epi16(in[5], in[10]);
michael@0 1541 u[11] = _mm_unpackhi_epi16(in[5], in[10]);
michael@0 1542 u[12] = _mm_unpacklo_epi16(in[3], in[12]);
michael@0 1543 u[13] = _mm_unpackhi_epi16(in[3], in[12]);
michael@0 1544 u[14] = _mm_unpacklo_epi16(in[1], in[14]);
michael@0 1545 u[15] = _mm_unpackhi_epi16(in[1], in[14]);
michael@0 1546
michael@0 1547 v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
michael@0 1548 v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
michael@0 1549 v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
michael@0 1550 v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
michael@0 1551 v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
michael@0 1552 v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
michael@0 1553 v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
michael@0 1554 v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
michael@0 1555 v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
michael@0 1556 v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
michael@0 1557 v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
michael@0 1558 v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
michael@0 1559 v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
michael@0 1560 v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
michael@0 1561 v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
michael@0 1562 v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
michael@0 1563 v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
michael@0 1564 v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
michael@0 1565 v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
michael@0 1566 v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
michael@0 1567 v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
michael@0 1568 v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
michael@0 1569 v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
michael@0 1570 v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
michael@0 1571 v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
michael@0 1572 v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
michael@0 1573 v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
michael@0 1574 v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
michael@0 1575 v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
michael@0 1576 v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
michael@0 1577 v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
michael@0 1578 v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
michael@0 1579
michael@0 1580 u[0] = _mm_add_epi32(v[0], v[16]);
michael@0 1581 u[1] = _mm_add_epi32(v[1], v[17]);
michael@0 1582 u[2] = _mm_add_epi32(v[2], v[18]);
michael@0 1583 u[3] = _mm_add_epi32(v[3], v[19]);
michael@0 1584 u[4] = _mm_add_epi32(v[4], v[20]);
michael@0 1585 u[5] = _mm_add_epi32(v[5], v[21]);
michael@0 1586 u[6] = _mm_add_epi32(v[6], v[22]);
michael@0 1587 u[7] = _mm_add_epi32(v[7], v[23]);
michael@0 1588 u[8] = _mm_add_epi32(v[8], v[24]);
michael@0 1589 u[9] = _mm_add_epi32(v[9], v[25]);
michael@0 1590 u[10] = _mm_add_epi32(v[10], v[26]);
michael@0 1591 u[11] = _mm_add_epi32(v[11], v[27]);
michael@0 1592 u[12] = _mm_add_epi32(v[12], v[28]);
michael@0 1593 u[13] = _mm_add_epi32(v[13], v[29]);
michael@0 1594 u[14] = _mm_add_epi32(v[14], v[30]);
michael@0 1595 u[15] = _mm_add_epi32(v[15], v[31]);
michael@0 1596 u[16] = _mm_sub_epi32(v[0], v[16]);
michael@0 1597 u[17] = _mm_sub_epi32(v[1], v[17]);
michael@0 1598 u[18] = _mm_sub_epi32(v[2], v[18]);
michael@0 1599 u[19] = _mm_sub_epi32(v[3], v[19]);
michael@0 1600 u[20] = _mm_sub_epi32(v[4], v[20]);
michael@0 1601 u[21] = _mm_sub_epi32(v[5], v[21]);
michael@0 1602 u[22] = _mm_sub_epi32(v[6], v[22]);
michael@0 1603 u[23] = _mm_sub_epi32(v[7], v[23]);
michael@0 1604 u[24] = _mm_sub_epi32(v[8], v[24]);
michael@0 1605 u[25] = _mm_sub_epi32(v[9], v[25]);
michael@0 1606 u[26] = _mm_sub_epi32(v[10], v[26]);
michael@0 1607 u[27] = _mm_sub_epi32(v[11], v[27]);
michael@0 1608 u[28] = _mm_sub_epi32(v[12], v[28]);
michael@0 1609 u[29] = _mm_sub_epi32(v[13], v[29]);
michael@0 1610 u[30] = _mm_sub_epi32(v[14], v[30]);
michael@0 1611 u[31] = _mm_sub_epi32(v[15], v[31]);
michael@0 1612
michael@0 1613 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1614 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1615 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1616 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1617 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1618 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1619 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1620 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1621 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
michael@0 1622 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
michael@0 1623 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 1624 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 1625 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 1626 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 1627 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 1628 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 1629 v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
michael@0 1630 v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
michael@0 1631 v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
michael@0 1632 v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
michael@0 1633 v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
michael@0 1634 v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
michael@0 1635 v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
michael@0 1636 v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
michael@0 1637 v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
michael@0 1638 v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
michael@0 1639 v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
michael@0 1640 v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
michael@0 1641 v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
michael@0 1642 v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
michael@0 1643 v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
michael@0 1644 v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
michael@0 1645
michael@0 1646 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
michael@0 1647 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
michael@0 1648 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
michael@0 1649 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
michael@0 1650 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
michael@0 1651 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
michael@0 1652 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
michael@0 1653 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
michael@0 1654 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
michael@0 1655 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
michael@0 1656 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 1657 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 1658 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 1659 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 1660 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 1661 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 1662 u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
michael@0 1663 u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
michael@0 1664 u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
michael@0 1665 u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
michael@0 1666 u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
michael@0 1667 u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
michael@0 1668 u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
michael@0 1669 u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
michael@0 1670 u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
michael@0 1671 u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
michael@0 1672 u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
michael@0 1673 u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
michael@0 1674 u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
michael@0 1675 u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
michael@0 1676 u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
michael@0 1677 u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
michael@0 1678
michael@0 1679 s[0] = _mm_packs_epi32(u[0], u[1]);
michael@0 1680 s[1] = _mm_packs_epi32(u[2], u[3]);
michael@0 1681 s[2] = _mm_packs_epi32(u[4], u[5]);
michael@0 1682 s[3] = _mm_packs_epi32(u[6], u[7]);
michael@0 1683 s[4] = _mm_packs_epi32(u[8], u[9]);
michael@0 1684 s[5] = _mm_packs_epi32(u[10], u[11]);
michael@0 1685 s[6] = _mm_packs_epi32(u[12], u[13]);
michael@0 1686 s[7] = _mm_packs_epi32(u[14], u[15]);
michael@0 1687 s[8] = _mm_packs_epi32(u[16], u[17]);
michael@0 1688 s[9] = _mm_packs_epi32(u[18], u[19]);
michael@0 1689 s[10] = _mm_packs_epi32(u[20], u[21]);
michael@0 1690 s[11] = _mm_packs_epi32(u[22], u[23]);
michael@0 1691 s[12] = _mm_packs_epi32(u[24], u[25]);
michael@0 1692 s[13] = _mm_packs_epi32(u[26], u[27]);
michael@0 1693 s[14] = _mm_packs_epi32(u[28], u[29]);
michael@0 1694 s[15] = _mm_packs_epi32(u[30], u[31]);
michael@0 1695
michael@0 1696 // stage 2
michael@0 1697 u[0] = _mm_unpacklo_epi16(s[8], s[9]);
michael@0 1698 u[1] = _mm_unpackhi_epi16(s[8], s[9]);
michael@0 1699 u[2] = _mm_unpacklo_epi16(s[10], s[11]);
michael@0 1700 u[3] = _mm_unpackhi_epi16(s[10], s[11]);
michael@0 1701 u[4] = _mm_unpacklo_epi16(s[12], s[13]);
michael@0 1702 u[5] = _mm_unpackhi_epi16(s[12], s[13]);
michael@0 1703 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
michael@0 1704 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
michael@0 1705
michael@0 1706 v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
michael@0 1707 v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
michael@0 1708 v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
michael@0 1709 v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
michael@0 1710 v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
michael@0 1711 v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
michael@0 1712 v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
michael@0 1713 v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
michael@0 1714 v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
michael@0 1715 v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
michael@0 1716 v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
michael@0 1717 v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
michael@0 1718 v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
michael@0 1719 v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
michael@0 1720 v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
michael@0 1721 v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
michael@0 1722
michael@0 1723 u[0] = _mm_add_epi32(v[0], v[8]);
michael@0 1724 u[1] = _mm_add_epi32(v[1], v[9]);
michael@0 1725 u[2] = _mm_add_epi32(v[2], v[10]);
michael@0 1726 u[3] = _mm_add_epi32(v[3], v[11]);
michael@0 1727 u[4] = _mm_add_epi32(v[4], v[12]);
michael@0 1728 u[5] = _mm_add_epi32(v[5], v[13]);
michael@0 1729 u[6] = _mm_add_epi32(v[6], v[14]);
michael@0 1730 u[7] = _mm_add_epi32(v[7], v[15]);
michael@0 1731 u[8] = _mm_sub_epi32(v[0], v[8]);
michael@0 1732 u[9] = _mm_sub_epi32(v[1], v[9]);
michael@0 1733 u[10] = _mm_sub_epi32(v[2], v[10]);
michael@0 1734 u[11] = _mm_sub_epi32(v[3], v[11]);
michael@0 1735 u[12] = _mm_sub_epi32(v[4], v[12]);
michael@0 1736 u[13] = _mm_sub_epi32(v[5], v[13]);
michael@0 1737 u[14] = _mm_sub_epi32(v[6], v[14]);
michael@0 1738 u[15] = _mm_sub_epi32(v[7], v[15]);
michael@0 1739
michael@0 1740 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1741 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1742 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1743 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1744 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1745 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1746 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1747 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1748 v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
michael@0 1749 v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
michael@0 1750 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 1751 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 1752 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 1753 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 1754 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 1755 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 1756
michael@0 1757 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
michael@0 1758 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
michael@0 1759 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
michael@0 1760 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
michael@0 1761 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
michael@0 1762 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
michael@0 1763 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
michael@0 1764 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
michael@0 1765 u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
michael@0 1766 u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
michael@0 1767 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 1768 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 1769 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 1770 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 1771 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 1772 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 1773
michael@0 1774 x[0] = _mm_add_epi16(s[0], s[4]);
michael@0 1775 x[1] = _mm_add_epi16(s[1], s[5]);
michael@0 1776 x[2] = _mm_add_epi16(s[2], s[6]);
michael@0 1777 x[3] = _mm_add_epi16(s[3], s[7]);
michael@0 1778 x[4] = _mm_sub_epi16(s[0], s[4]);
michael@0 1779 x[5] = _mm_sub_epi16(s[1], s[5]);
michael@0 1780 x[6] = _mm_sub_epi16(s[2], s[6]);
michael@0 1781 x[7] = _mm_sub_epi16(s[3], s[7]);
michael@0 1782 x[8] = _mm_packs_epi32(u[0], u[1]);
michael@0 1783 x[9] = _mm_packs_epi32(u[2], u[3]);
michael@0 1784 x[10] = _mm_packs_epi32(u[4], u[5]);
michael@0 1785 x[11] = _mm_packs_epi32(u[6], u[7]);
michael@0 1786 x[12] = _mm_packs_epi32(u[8], u[9]);
michael@0 1787 x[13] = _mm_packs_epi32(u[10], u[11]);
michael@0 1788 x[14] = _mm_packs_epi32(u[12], u[13]);
michael@0 1789 x[15] = _mm_packs_epi32(u[14], u[15]);
michael@0 1790
michael@0 1791 // stage 3
michael@0 1792 u[0] = _mm_unpacklo_epi16(x[4], x[5]);
michael@0 1793 u[1] = _mm_unpackhi_epi16(x[4], x[5]);
michael@0 1794 u[2] = _mm_unpacklo_epi16(x[6], x[7]);
michael@0 1795 u[3] = _mm_unpackhi_epi16(x[6], x[7]);
michael@0 1796 u[4] = _mm_unpacklo_epi16(x[12], x[13]);
michael@0 1797 u[5] = _mm_unpackhi_epi16(x[12], x[13]);
michael@0 1798 u[6] = _mm_unpacklo_epi16(x[14], x[15]);
michael@0 1799 u[7] = _mm_unpackhi_epi16(x[14], x[15]);
michael@0 1800
michael@0 1801 v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
michael@0 1802 v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
michael@0 1803 v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
michael@0 1804 v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
michael@0 1805 v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
michael@0 1806 v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
michael@0 1807 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
michael@0 1808 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
michael@0 1809 v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
michael@0 1810 v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
michael@0 1811 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
michael@0 1812 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
michael@0 1813 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
michael@0 1814 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
michael@0 1815 v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
michael@0 1816 v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
michael@0 1817
michael@0 1818 u[0] = _mm_add_epi32(v[0], v[4]);
michael@0 1819 u[1] = _mm_add_epi32(v[1], v[5]);
michael@0 1820 u[2] = _mm_add_epi32(v[2], v[6]);
michael@0 1821 u[3] = _mm_add_epi32(v[3], v[7]);
michael@0 1822 u[4] = _mm_sub_epi32(v[0], v[4]);
michael@0 1823 u[5] = _mm_sub_epi32(v[1], v[5]);
michael@0 1824 u[6] = _mm_sub_epi32(v[2], v[6]);
michael@0 1825 u[7] = _mm_sub_epi32(v[3], v[7]);
michael@0 1826 u[8] = _mm_add_epi32(v[8], v[12]);
michael@0 1827 u[9] = _mm_add_epi32(v[9], v[13]);
michael@0 1828 u[10] = _mm_add_epi32(v[10], v[14]);
michael@0 1829 u[11] = _mm_add_epi32(v[11], v[15]);
michael@0 1830 u[12] = _mm_sub_epi32(v[8], v[12]);
michael@0 1831 u[13] = _mm_sub_epi32(v[9], v[13]);
michael@0 1832 u[14] = _mm_sub_epi32(v[10], v[14]);
michael@0 1833 u[15] = _mm_sub_epi32(v[11], v[15]);
michael@0 1834
michael@0 1835 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1836 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1837 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1838 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1839 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1840 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1841 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1842 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1843 u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
michael@0 1844 u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
michael@0 1845 u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 1846 u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 1847 u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 1848 u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 1849 u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 1850 u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 1851
michael@0 1852 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1853 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1854 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1855 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1856 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 1857 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 1858 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 1859 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 1860 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
michael@0 1861 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
michael@0 1862 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
michael@0 1863 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
michael@0 1864 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
michael@0 1865 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
michael@0 1866 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
michael@0 1867 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
michael@0 1868
michael@0 1869 s[0] = _mm_add_epi16(x[0], x[2]);
michael@0 1870 s[1] = _mm_add_epi16(x[1], x[3]);
michael@0 1871 s[2] = _mm_sub_epi16(x[0], x[2]);
michael@0 1872 s[3] = _mm_sub_epi16(x[1], x[3]);
michael@0 1873 s[4] = _mm_packs_epi32(v[0], v[1]);
michael@0 1874 s[5] = _mm_packs_epi32(v[2], v[3]);
michael@0 1875 s[6] = _mm_packs_epi32(v[4], v[5]);
michael@0 1876 s[7] = _mm_packs_epi32(v[6], v[7]);
michael@0 1877 s[8] = _mm_add_epi16(x[8], x[10]);
michael@0 1878 s[9] = _mm_add_epi16(x[9], x[11]);
michael@0 1879 s[10] = _mm_sub_epi16(x[8], x[10]);
michael@0 1880 s[11] = _mm_sub_epi16(x[9], x[11]);
michael@0 1881 s[12] = _mm_packs_epi32(v[8], v[9]);
michael@0 1882 s[13] = _mm_packs_epi32(v[10], v[11]);
michael@0 1883 s[14] = _mm_packs_epi32(v[12], v[13]);
michael@0 1884 s[15] = _mm_packs_epi32(v[14], v[15]);
michael@0 1885
michael@0 1886 // stage 4
michael@0 1887 u[0] = _mm_unpacklo_epi16(s[2], s[3]);
michael@0 1888 u[1] = _mm_unpackhi_epi16(s[2], s[3]);
michael@0 1889 u[2] = _mm_unpacklo_epi16(s[6], s[7]);
michael@0 1890 u[3] = _mm_unpackhi_epi16(s[6], s[7]);
michael@0 1891 u[4] = _mm_unpacklo_epi16(s[10], s[11]);
michael@0 1892 u[5] = _mm_unpackhi_epi16(s[10], s[11]);
michael@0 1893 u[6] = _mm_unpacklo_epi16(s[14], s[15]);
michael@0 1894 u[7] = _mm_unpackhi_epi16(s[14], s[15]);
michael@0 1895
michael@0 1896 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
michael@0 1897 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
michael@0 1898 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
michael@0 1899 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
michael@0 1900 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
michael@0 1901 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
michael@0 1902 v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
michael@0 1903 v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
michael@0 1904 v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
michael@0 1905 v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
michael@0 1906 v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
michael@0 1907 v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
michael@0 1908 v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
michael@0 1909 v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
michael@0 1910 v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
michael@0 1911 v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
michael@0 1912
michael@0 1913 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 1914 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 1915 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 1916 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 1917 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 1918 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 1919 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 1920 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 1921 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
michael@0 1922 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
michael@0 1923 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
michael@0 1924 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
michael@0 1925 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
michael@0 1926 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
michael@0 1927 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
michael@0 1928 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
michael@0 1929
michael@0 1930 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1931 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1932 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1933 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1934 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 1935 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 1936 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 1937 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 1938 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
michael@0 1939 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
michael@0 1940 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
michael@0 1941 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
michael@0 1942 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
michael@0 1943 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
michael@0 1944 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
michael@0 1945 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
michael@0 1946
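  // Final ADST output permutation; outputs 1, 3, 13 and 15 are negated
  // (kZero - s[...]) per the transform's sign pattern.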
michael@0 1947 in[0] = s[0];
michael@0 1948 in[1] = _mm_sub_epi16(kZero, s[8]);
michael@0 1949 in[2] = s[12];
michael@0 1950 in[3] = _mm_sub_epi16(kZero, s[4]);
michael@0 1951 in[4] = _mm_packs_epi32(v[4], v[5]);
michael@0 1952 in[5] = _mm_packs_epi32(v[12], v[13]);
michael@0 1953 in[6] = _mm_packs_epi32(v[8], v[9]);
michael@0 1954 in[7] = _mm_packs_epi32(v[0], v[1]);
michael@0 1955 in[8] = _mm_packs_epi32(v[2], v[3]);
michael@0 1956 in[9] = _mm_packs_epi32(v[10], v[11]);
michael@0 1957 in[10] = _mm_packs_epi32(v[14], v[15]);
michael@0 1958 in[11] = _mm_packs_epi32(v[6], v[7]);
michael@0 1959 in[12] = s[5];
michael@0 1960 in[13] = _mm_sub_epi16(kZero, s[13]);
michael@0 1961 in[14] = s[9];
michael@0 1962 in[15] = _mm_sub_epi16(kZero, s[1]);
michael@0 1963 }
michael@0 1964
michael@0 1965 static void idct16_1d_8col(__m128i *in) {
michael@0 1966 const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
michael@0 1967 const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
michael@0 1968 const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
michael@0 1969 const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
michael@0 1970 const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
michael@0 1971 const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
michael@0 1972 const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
michael@0 1973 const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
michael@0 1974 const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
michael@0 1975 const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
michael@0 1976 const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
michael@0 1977 const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
michael@0 1978 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
michael@0 1979 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 1980 const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
michael@0 1981 const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
michael@0 1982 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 1983 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
michael@0 1984 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
michael@0 1985 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 1986 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 1987 __m128i v[16], u[16], s[16], t[16];
michael@0 1988
michael@0 1989 // stage 1
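  // (even-indexed inputs, in bit-reversed order, feed s[0..7]; the
  // odd-indexed inputs, likewise bit-reversed, feed s[8..15])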
michael@0 1990 s[0] = in[0];
michael@0 1991 s[1] = in[8];
michael@0 1992 s[2] = in[4];
michael@0 1993 s[3] = in[12];
michael@0 1994 s[4] = in[2];
michael@0 1995 s[5] = in[10];
michael@0 1996 s[6] = in[6];
michael@0 1997 s[7] = in[14];
michael@0 1998 s[8] = in[1];
michael@0 1999 s[9] = in[9];
michael@0 2000 s[10] = in[5];
michael@0 2001 s[11] = in[13];
michael@0 2002 s[12] = in[3];
michael@0 2003 s[13] = in[11];
michael@0 2004 s[14] = in[7];
michael@0 2005 s[15] = in[15];
michael@0 2006
michael@0 2007 // stage 2
michael@0 2008 u[0] = _mm_unpacklo_epi16(s[8], s[15]);
michael@0 2009 u[1] = _mm_unpackhi_epi16(s[8], s[15]);
michael@0 2010 u[2] = _mm_unpacklo_epi16(s[9], s[14]);
michael@0 2011 u[3] = _mm_unpackhi_epi16(s[9], s[14]);
michael@0 2012 u[4] = _mm_unpacklo_epi16(s[10], s[13]);
michael@0 2013 u[5] = _mm_unpackhi_epi16(s[10], s[13]);
michael@0 2014 u[6] = _mm_unpacklo_epi16(s[11], s[12]);
michael@0 2015 u[7] = _mm_unpackhi_epi16(s[11], s[12]);
michael@0 2016
michael@0 2017 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
michael@0 2018 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
michael@0 2019 v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
michael@0 2020 v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
michael@0 2021 v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
michael@0 2022 v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
michael@0 2023 v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
michael@0 2024 v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
michael@0 2025 v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
michael@0 2026 v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
michael@0 2027 v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
michael@0 2028 v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
michael@0 2029 v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
michael@0 2030 v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
michael@0 2031 v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
michael@0 2032 v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);
michael@0 2033
michael@0 2034 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 2035 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 2036 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 2037 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 2038 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 2039 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 2040 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 2041 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 2042 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
michael@0 2043 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
michael@0 2044 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
michael@0 2045 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
michael@0 2046 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
michael@0 2047 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
michael@0 2048 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
michael@0 2049 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
michael@0 2050
michael@0 2051 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 2052 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 2053 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 2054 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 2055 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 2056 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 2057 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 2058 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 2059 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
michael@0 2060 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
michael@0 2061 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
michael@0 2062 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
michael@0 2063 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
michael@0 2064 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
michael@0 2065 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
michael@0 2066 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
michael@0 2067
michael@0 2068 s[8] = _mm_packs_epi32(u[0], u[1]);
michael@0 2069 s[15] = _mm_packs_epi32(u[2], u[3]);
michael@0 2070 s[9] = _mm_packs_epi32(u[4], u[5]);
michael@0 2071 s[14] = _mm_packs_epi32(u[6], u[7]);
michael@0 2072 s[10] = _mm_packs_epi32(u[8], u[9]);
michael@0 2073 s[13] = _mm_packs_epi32(u[10], u[11]);
michael@0 2074 s[11] = _mm_packs_epi32(u[12], u[13]);
michael@0 2075 s[12] = _mm_packs_epi32(u[14], u[15]);
michael@0 2076
michael@0 2077 // stage 3
michael@0 2078 t[0] = s[0];
michael@0 2079 t[1] = s[1];
michael@0 2080 t[2] = s[2];
michael@0 2081 t[3] = s[3];
michael@0 2082 u[0] = _mm_unpacklo_epi16(s[4], s[7]);
michael@0 2083 u[1] = _mm_unpackhi_epi16(s[4], s[7]);
michael@0 2084 u[2] = _mm_unpacklo_epi16(s[5], s[6]);
michael@0 2085 u[3] = _mm_unpackhi_epi16(s[5], s[6]);
michael@0 2086
michael@0 2087 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
michael@0 2088 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
michael@0 2089 v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
michael@0 2090 v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
michael@0 2091 v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
michael@0 2092 v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
michael@0 2093 v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
michael@0 2094 v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
michael@0 2095
michael@0 2096 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 2097 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 2098 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 2099 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 2100 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 2101 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 2102 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 2103 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 2104
michael@0 2105 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 2106 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 2107 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 2108 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 2109 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 2110 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 2111 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 2112 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 2113
michael@0 2114 t[4] = _mm_packs_epi32(u[0], u[1]);
michael@0 2115 t[7] = _mm_packs_epi32(u[2], u[3]);
michael@0 2116 t[5] = _mm_packs_epi32(u[4], u[5]);
michael@0 2117 t[6] = _mm_packs_epi32(u[6], u[7]);
michael@0 2118 t[8] = _mm_add_epi16(s[8], s[9]);
michael@0 2119 t[9] = _mm_sub_epi16(s[8], s[9]);
michael@0 2120 t[10] = _mm_sub_epi16(s[11], s[10]);
michael@0 2121 t[11] = _mm_add_epi16(s[10], s[11]);
michael@0 2122 t[12] = _mm_add_epi16(s[12], s[13]);
michael@0 2123 t[13] = _mm_sub_epi16(s[12], s[13]);
michael@0 2124 t[14] = _mm_sub_epi16(s[15], s[14]);
michael@0 2125 t[15] = _mm_add_epi16(s[14], s[15]);
michael@0 2126
michael@0 2127 // stage 4
michael@0 2128 u[0] = _mm_unpacklo_epi16(t[0], t[1]);
michael@0 2129 u[1] = _mm_unpackhi_epi16(t[0], t[1]);
michael@0 2130 u[2] = _mm_unpacklo_epi16(t[2], t[3]);
michael@0 2131 u[3] = _mm_unpackhi_epi16(t[2], t[3]);
michael@0 2132 u[4] = _mm_unpacklo_epi16(t[9], t[14]);
michael@0 2133 u[5] = _mm_unpackhi_epi16(t[9], t[14]);
michael@0 2134 u[6] = _mm_unpacklo_epi16(t[10], t[13]);
michael@0 2135 u[7] = _mm_unpackhi_epi16(t[10], t[13]);
michael@0 2136
michael@0 2137 v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
michael@0 2138 v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
michael@0 2139 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
michael@0 2140 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
michael@0 2141 v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
michael@0 2142 v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
michael@0 2143 v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
michael@0 2144 v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
michael@0 2145 v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
michael@0 2146 v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
michael@0 2147 v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
michael@0 2148 v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
michael@0 2149 v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
michael@0 2150 v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
michael@0 2151 v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
michael@0 2152 v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);
michael@0 2153
michael@0 2154 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 2155 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 2156 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 2157 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 2158 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 2159 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 2160 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 2161 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 2162 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
michael@0 2163 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
michael@0 2164 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
michael@0 2165 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
michael@0 2166 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
michael@0 2167 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
michael@0 2168 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
michael@0 2169 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
michael@0 2170
michael@0 2171 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 2172 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 2173 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 2174 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 2175 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 2176 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 2177 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 2178 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 2179 u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
michael@0 2180 u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
michael@0 2181 u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
michael@0 2182 u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
michael@0 2183 u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
michael@0 2184 u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
michael@0 2185 u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
michael@0 2186 u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
michael@0 2187
michael@0 2188 s[0] = _mm_packs_epi32(u[0], u[1]);
michael@0 2189 s[1] = _mm_packs_epi32(u[2], u[3]);
michael@0 2190 s[2] = _mm_packs_epi32(u[4], u[5]);
michael@0 2191 s[3] = _mm_packs_epi32(u[6], u[7]);
michael@0 2192 s[4] = _mm_add_epi16(t[4], t[5]);
michael@0 2193 s[5] = _mm_sub_epi16(t[4], t[5]);
michael@0 2194 s[6] = _mm_sub_epi16(t[7], t[6]);
michael@0 2195 s[7] = _mm_add_epi16(t[6], t[7]);
michael@0 2196 s[8] = t[8];
michael@0 2197 s[15] = t[15];
michael@0 2198 s[9] = _mm_packs_epi32(u[8], u[9]);
michael@0 2199 s[14] = _mm_packs_epi32(u[10], u[11]);
michael@0 2200 s[10] = _mm_packs_epi32(u[12], u[13]);
michael@0 2201 s[13] = _mm_packs_epi32(u[14], u[15]);
michael@0 2202 s[11] = t[11];
michael@0 2203 s[12] = t[12];
michael@0 2204
michael@0 2205 // stage 5
michael@0 2206 t[0] = _mm_add_epi16(s[0], s[3]);
michael@0 2207 t[1] = _mm_add_epi16(s[1], s[2]);
michael@0 2208 t[2] = _mm_sub_epi16(s[1], s[2]);
michael@0 2209 t[3] = _mm_sub_epi16(s[0], s[3]);
michael@0 2210 t[4] = s[4];
michael@0 2211 t[7] = s[7];
michael@0 2212
michael@0 2213 u[0] = _mm_unpacklo_epi16(s[5], s[6]);
michael@0 2214 u[1] = _mm_unpackhi_epi16(s[5], s[6]);
michael@0 2215 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
michael@0 2216 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
michael@0 2217 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
michael@0 2218 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
michael@0 2219 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 2220 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 2221 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 2222 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 2223 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 2224 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 2225 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 2226 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 2227 t[5] = _mm_packs_epi32(u[0], u[1]);
michael@0 2228 t[6] = _mm_packs_epi32(u[2], u[3]);
michael@0 2229
michael@0 2230 t[8] = _mm_add_epi16(s[8], s[11]);
michael@0 2231 t[9] = _mm_add_epi16(s[9], s[10]);
michael@0 2232 t[10] = _mm_sub_epi16(s[9], s[10]);
michael@0 2233 t[11] = _mm_sub_epi16(s[8], s[11]);
michael@0 2234 t[12] = _mm_sub_epi16(s[15], s[12]);
michael@0 2235 t[13] = _mm_sub_epi16(s[14], s[13]);
michael@0 2236 t[14] = _mm_add_epi16(s[13], s[14]);
michael@0 2237 t[15] = _mm_add_epi16(s[12], s[15]);
michael@0 2238
michael@0 2239 // stage 6
michael@0 2240 s[0] = _mm_add_epi16(t[0], t[7]);
michael@0 2241 s[1] = _mm_add_epi16(t[1], t[6]);
michael@0 2242 s[2] = _mm_add_epi16(t[2], t[5]);
michael@0 2243 s[3] = _mm_add_epi16(t[3], t[4]);
michael@0 2244 s[4] = _mm_sub_epi16(t[3], t[4]);
michael@0 2245 s[5] = _mm_sub_epi16(t[2], t[5]);
michael@0 2246 s[6] = _mm_sub_epi16(t[1], t[6]);
michael@0 2247 s[7] = _mm_sub_epi16(t[0], t[7]);
michael@0 2248 s[8] = t[8];
michael@0 2249 s[9] = t[9];
michael@0 2250
michael@0 2251 u[0] = _mm_unpacklo_epi16(t[10], t[13]);
michael@0 2252 u[1] = _mm_unpackhi_epi16(t[10], t[13]);
michael@0 2253 u[2] = _mm_unpacklo_epi16(t[11], t[12]);
michael@0 2254 u[3] = _mm_unpackhi_epi16(t[11], t[12]);
michael@0 2255
michael@0 2256 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
michael@0 2257 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
michael@0 2258 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
michael@0 2259 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
michael@0 2260 v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
michael@0 2261 v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
michael@0 2262 v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
michael@0 2263 v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
michael@0 2264
michael@0 2265 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 2266 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 2267 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 2268 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 2269 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 2270 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 2271 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 2272 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 2273
michael@0 2274 u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 2275 u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 2276 u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 2277 u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 2278 u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 2279 u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 2280 u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 2281 u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 2282
michael@0 2283 s[10] = _mm_packs_epi32(u[0], u[1]);
michael@0 2284 s[13] = _mm_packs_epi32(u[2], u[3]);
michael@0 2285 s[11] = _mm_packs_epi32(u[4], u[5]);
michael@0 2286 s[12] = _mm_packs_epi32(u[6], u[7]);
michael@0 2287 s[14] = t[14];
michael@0 2288 s[15] = t[15];
michael@0 2289
michael@0 2290 // stage 7
michael@0 2291 in[0] = _mm_add_epi16(s[0], s[15]);
michael@0 2292 in[1] = _mm_add_epi16(s[1], s[14]);
michael@0 2293 in[2] = _mm_add_epi16(s[2], s[13]);
michael@0 2294 in[3] = _mm_add_epi16(s[3], s[12]);
michael@0 2295 in[4] = _mm_add_epi16(s[4], s[11]);
michael@0 2296 in[5] = _mm_add_epi16(s[5], s[10]);
michael@0 2297 in[6] = _mm_add_epi16(s[6], s[9]);
michael@0 2298 in[7] = _mm_add_epi16(s[7], s[8]);
michael@0 2299 in[8] = _mm_sub_epi16(s[7], s[8]);
michael@0 2300 in[9] = _mm_sub_epi16(s[6], s[9]);
michael@0 2301 in[10] = _mm_sub_epi16(s[5], s[10]);
michael@0 2302 in[11] = _mm_sub_epi16(s[4], s[11]);
michael@0 2303 in[12] = _mm_sub_epi16(s[3], s[12]);
michael@0 2304 in[13] = _mm_sub_epi16(s[2], s[13]);
michael@0 2305 in[14] = _mm_sub_epi16(s[1], s[14]);
michael@0 2306 in[15] = _mm_sub_epi16(s[0], s[15]);
michael@0 2307 }
michael@0 2308
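// The 16x16 block is kept as two 8x16 halves (in0, in1). Each wrapper below
// transposes the full block, then applies the 16-point 1-D transform to
// eight columns per half.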
michael@0 2309 static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
michael@0 2310 array_transpose_16x16(in0, in1);
michael@0 2311 idct16_1d_8col(in0);
michael@0 2312 idct16_1d_8col(in1);
michael@0 2313 }
michael@0 2314
michael@0 2315 static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
michael@0 2316 array_transpose_16x16(in0, in1);
michael@0 2317 iadst16_1d_8col(in0);
michael@0 2318 iadst16_1d_8col(in1);
michael@0 2319 }
michael@0 2320
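// Load one 8x16 half of a 16x16 coefficient block; consecutive rows lie
// 16 coefficients apart in memory.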
michael@0 2321 static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
michael@0 2322 in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
michael@0 2323 in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
michael@0 2324 in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
michael@0 2325 in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
michael@0 2326 in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
michael@0 2327 in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
michael@0 2328 in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
michael@0 2329 in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));
michael@0 2330
michael@0 2331 in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
michael@0 2332 in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
michael@0 2333 in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
michael@0 2334 in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
michael@0 2335 in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
michael@0 2336 in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
michael@0 2337 in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
michael@0 2338 in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
michael@0 2339 }
michael@0 2340
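// Add the prediction back and store. The transform output carries a scale
// factor of 64, undone here by rounding with 32 and shifting right by 6.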
michael@0 2341 static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
michael@0 2342 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
michael@0 2343 const __m128i zero = _mm_setzero_si128();
michael@0 2344 // Final rounding and shift
michael@0 2345 in[0] = _mm_adds_epi16(in[0], final_rounding);
michael@0 2346 in[1] = _mm_adds_epi16(in[1], final_rounding);
michael@0 2347 in[2] = _mm_adds_epi16(in[2], final_rounding);
michael@0 2348 in[3] = _mm_adds_epi16(in[3], final_rounding);
michael@0 2349 in[4] = _mm_adds_epi16(in[4], final_rounding);
michael@0 2350 in[5] = _mm_adds_epi16(in[5], final_rounding);
michael@0 2351 in[6] = _mm_adds_epi16(in[6], final_rounding);
michael@0 2352 in[7] = _mm_adds_epi16(in[7], final_rounding);
michael@0 2353 in[8] = _mm_adds_epi16(in[8], final_rounding);
michael@0 2354 in[9] = _mm_adds_epi16(in[9], final_rounding);
michael@0 2355 in[10] = _mm_adds_epi16(in[10], final_rounding);
michael@0 2356 in[11] = _mm_adds_epi16(in[11], final_rounding);
michael@0 2357 in[12] = _mm_adds_epi16(in[12], final_rounding);
michael@0 2358 in[13] = _mm_adds_epi16(in[13], final_rounding);
michael@0 2359 in[14] = _mm_adds_epi16(in[14], final_rounding);
michael@0 2360 in[15] = _mm_adds_epi16(in[15], final_rounding);
michael@0 2361
michael@0 2362 in[0] = _mm_srai_epi16(in[0], 6);
michael@0 2363 in[1] = _mm_srai_epi16(in[1], 6);
michael@0 2364 in[2] = _mm_srai_epi16(in[2], 6);
michael@0 2365 in[3] = _mm_srai_epi16(in[3], 6);
michael@0 2366 in[4] = _mm_srai_epi16(in[4], 6);
michael@0 2367 in[5] = _mm_srai_epi16(in[5], 6);
michael@0 2368 in[6] = _mm_srai_epi16(in[6], 6);
michael@0 2369 in[7] = _mm_srai_epi16(in[7], 6);
michael@0 2370 in[8] = _mm_srai_epi16(in[8], 6);
michael@0 2371 in[9] = _mm_srai_epi16(in[9], 6);
michael@0 2372 in[10] = _mm_srai_epi16(in[10], 6);
michael@0 2373 in[11] = _mm_srai_epi16(in[11], 6);
michael@0 2374 in[12] = _mm_srai_epi16(in[12], 6);
michael@0 2375 in[13] = _mm_srai_epi16(in[13], 6);
michael@0 2376 in[14] = _mm_srai_epi16(in[14], 6);
michael@0 2377 in[15] = _mm_srai_epi16(in[15], 6);
michael@0 2378
michael@0 2379 RECON_AND_STORE(dest, in[0]);
michael@0 2380 RECON_AND_STORE(dest, in[1]);
michael@0 2381 RECON_AND_STORE(dest, in[2]);
michael@0 2382 RECON_AND_STORE(dest, in[3]);
michael@0 2383 RECON_AND_STORE(dest, in[4]);
michael@0 2384 RECON_AND_STORE(dest, in[5]);
michael@0 2385 RECON_AND_STORE(dest, in[6]);
michael@0 2386 RECON_AND_STORE(dest, in[7]);
michael@0 2387 RECON_AND_STORE(dest, in[8]);
michael@0 2388 RECON_AND_STORE(dest, in[9]);
michael@0 2389 RECON_AND_STORE(dest, in[10]);
michael@0 2390 RECON_AND_STORE(dest, in[11]);
michael@0 2391 RECON_AND_STORE(dest, in[12]);
michael@0 2392 RECON_AND_STORE(dest, in[13]);
michael@0 2393 RECON_AND_STORE(dest, in[14]);
michael@0 2394 RECON_AND_STORE(dest, in[15]);
michael@0 2395 }
michael@0 2396
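// 16x16 hybrid inverse transform. tx_type selects DCT or ADST independently
// for each direction; each *_1d_sse2 call transposes the block first, so the
// first call below acts on rows and the second on columns.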
michael@0 2397 void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
michael@0 2398 int tx_type) {
michael@0 2399 __m128i in0[16], in1[16];
michael@0 2400
michael@0 2401 load_buffer_8x16(input, in0);
michael@0 2402 input += 8;
michael@0 2403 load_buffer_8x16(input, in1);
michael@0 2404
michael@0 2405 switch (tx_type) {
michael@0 2406 case 0: // DCT_DCT
michael@0 2407 idct16_1d_sse2(in0, in1);
michael@0 2408 idct16_1d_sse2(in0, in1);
michael@0 2409 break;
michael@0 2410 case 1: // ADST_DCT
michael@0 2411 idct16_1d_sse2(in0, in1);
michael@0 2412 iadst16_1d_sse2(in0, in1);
michael@0 2413 break;
michael@0 2414 case 2: // DCT_ADST
michael@0 2415 iadst16_1d_sse2(in0, in1);
michael@0 2416 idct16_1d_sse2(in0, in1);
michael@0 2417 break;
michael@0 2418 case 3: // ADST_ADST
michael@0 2419 iadst16_1d_sse2(in0, in1);
michael@0 2420 iadst16_1d_sse2(in0, in1);
michael@0 2421 break;
michael@0 2422 default:
michael@0 2423 assert(0);
michael@0 2424 break;
michael@0 2425 }
michael@0 2426
michael@0 2427 write_buffer_8x16(dest, in0, stride);
michael@0 2428 dest += 8;
michael@0 2429 write_buffer_8x16(dest, in1, stride);
michael@0 2430 }
michael@0 2431
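// 16x16 inverse transform for blocks with at most 10 non-zero coefficients,
// which the scan order confines to the top-left 4x4; only the first four
// input rows need to be loaded for the first 1-D pass.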
michael@0 2432 void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
michael@0 2433 int stride) {
michael@0 2434 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 2435 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
michael@0 2436 const __m128i zero = _mm_setzero_si128();
michael@0 2437
michael@0 2438 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
michael@0 2439 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
michael@0 2440 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
michael@0 2441 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
michael@0 2442 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
michael@0 2443 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
michael@0 2444 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
michael@0 2445 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
michael@0 2446
michael@0 2447 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
michael@0 2448 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
michael@0 2449 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
michael@0 2450 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
michael@0 2451
michael@0 2452 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
michael@0 2453 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 2454 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
michael@0 2455 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
michael@0 2456 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 2457 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
michael@0 2458 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
michael@0 2459 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 2460
michael@0 2461 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 2462
michael@0 2463 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
michael@0 2464 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
michael@0 2465 in10 = zero, in11 = zero, in12 = zero, in13 = zero,
michael@0 2466 in14 = zero, in15 = zero;
michael@0 2467 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
michael@0 2468 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
michael@0 2469 l12 = zero, l13 = zero, l14 = zero, l15 = zero;
michael@0 2470
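// stp1_8_0 and stp1_12_0 preserve the stage-3 values of elements 8 and 12,
// which are still needed after stp1_8 and stp1_12 are overwritten in stage 5.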
michael@0 2471 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
michael@0 2472 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
michael@0 2473 stp1_8_0, stp1_12_0;
michael@0 2474 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
michael@0 2475 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
michael@0 2476 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
michael@0 2477 int i;
michael@0 2478 // 1-D idct. Load input data: rows 0-3 of the 16x16 block (the remaining rows are zero in this reduced case).
michael@0 2479 in0 = _mm_load_si128((const __m128i *)input);
michael@0 2480 in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
michael@0 2481 in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
michael@0 2482 in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
michael@0 2483 in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
michael@0 2484 in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
michael@0 2485 in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
michael@0 2486 in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
michael@0 2487
michael@0 2488 TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
michael@0 2489 TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
michael@0 2490
michael@0 2491 // Stage2
michael@0 2492 {
michael@0 2493 const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
michael@0 2494 const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
michael@0 2495 const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
michael@0 2496 const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
michael@0 2497
michael@0 2498 tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
michael@0 2499 tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
michael@0 2500 tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
michael@0 2501 tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
michael@0 2502 tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
michael@0 2503 tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
michael@0 2504 tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
michael@0 2505 tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
michael@0 2506
michael@0 2507 tmp0 = _mm_add_epi32(tmp0, rounding);
michael@0 2508 tmp2 = _mm_add_epi32(tmp2, rounding);
michael@0 2509 tmp4 = _mm_add_epi32(tmp4, rounding);
michael@0 2510 tmp6 = _mm_add_epi32(tmp6, rounding);
michael@0 2511 tmp1 = _mm_add_epi32(tmp1, rounding);
michael@0 2512 tmp3 = _mm_add_epi32(tmp3, rounding);
michael@0 2513 tmp5 = _mm_add_epi32(tmp5, rounding);
michael@0 2514 tmp7 = _mm_add_epi32(tmp7, rounding);
michael@0 2515
michael@0 2516 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
michael@0 2517 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
michael@0 2518 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
michael@0 2519 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
michael@0 2520 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
michael@0 2521 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
michael@0 2522 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
michael@0 2523 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
michael@0 2524
michael@0 2525 stp2_8 = _mm_packs_epi32(tmp0, zero);
michael@0 2526 stp2_15 = _mm_packs_epi32(tmp2, zero);
michael@0 2527 stp2_9 = _mm_packs_epi32(tmp4, zero);
michael@0 2528 stp2_14 = _mm_packs_epi32(tmp6, zero);
michael@0 2529
michael@0 2530 stp2_10 = _mm_packs_epi32(tmp1, zero);
michael@0 2531 stp2_13 = _mm_packs_epi32(tmp3, zero);
michael@0 2532 stp2_11 = _mm_packs_epi32(tmp5, zero);
michael@0 2533 stp2_12 = _mm_packs_epi32(tmp7, zero);
michael@0 2534 }
michael@0 2535
michael@0 2536 // Stage3
michael@0 2537 {
michael@0 2538 const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
michael@0 2539 const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
michael@0 2540
michael@0 2541 tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
michael@0 2542 tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
michael@0 2543 tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
michael@0 2544 tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
michael@0 2545
michael@0 2546 tmp0 = _mm_add_epi32(tmp0, rounding);
michael@0 2547 tmp2 = _mm_add_epi32(tmp2, rounding);
michael@0 2548 tmp4 = _mm_add_epi32(tmp4, rounding);
michael@0 2549 tmp6 = _mm_add_epi32(tmp6, rounding);
michael@0 2550
michael@0 2551 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
michael@0 2552 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
michael@0 2553 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
michael@0 2554 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
michael@0 2555
michael@0 2556 stp1_4 = _mm_packs_epi32(tmp0, zero);
michael@0 2557 stp1_7 = _mm_packs_epi32(tmp2, zero);
michael@0 2558 stp1_5 = _mm_packs_epi32(tmp4, zero);
michael@0 2559 stp1_6 = _mm_packs_epi32(tmp6, zero);
michael@0 2560
michael@0 2561 stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
michael@0 2562 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
michael@0 2563 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
michael@0 2564 stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
michael@0 2565
michael@0 2566 stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
michael@0 2567 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
michael@0 2568 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
michael@0 2569 stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
michael@0 2570 }
michael@0 2571
michael@0 2572 // Stage4
michael@0 2573 {
michael@0 2574 const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
michael@0 2575 const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
michael@0 2576 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
michael@0 2577 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
michael@0 2578
michael@0 2579 tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
michael@0 2580 tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
michael@0 2581 tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
michael@0 2582 tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
michael@0 2583 tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
michael@0 2584 tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
michael@0 2585 tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
michael@0 2586 tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
michael@0 2587
michael@0 2588 tmp0 = _mm_add_epi32(tmp0, rounding);
michael@0 2589 tmp2 = _mm_add_epi32(tmp2, rounding);
michael@0 2590 tmp4 = _mm_add_epi32(tmp4, rounding);
michael@0 2591 tmp6 = _mm_add_epi32(tmp6, rounding);
michael@0 2592 tmp1 = _mm_add_epi32(tmp1, rounding);
michael@0 2593 tmp3 = _mm_add_epi32(tmp3, rounding);
michael@0 2594 tmp5 = _mm_add_epi32(tmp5, rounding);
michael@0 2595 tmp7 = _mm_add_epi32(tmp7, rounding);
michael@0 2596
michael@0 2597 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
michael@0 2598 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
michael@0 2599 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
michael@0 2600 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
michael@0 2601 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
michael@0 2602 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
michael@0 2603 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
michael@0 2604 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
michael@0 2605
michael@0 2606 stp2_0 = _mm_packs_epi32(tmp0, zero);
michael@0 2607 stp2_1 = _mm_packs_epi32(tmp2, zero);
michael@0 2608 stp2_2 = _mm_packs_epi32(tmp4, zero);
michael@0 2609 stp2_3 = _mm_packs_epi32(tmp6, zero);
michael@0 2610 stp2_9 = _mm_packs_epi32(tmp1, zero);
michael@0 2611 stp2_14 = _mm_packs_epi32(tmp3, zero);
michael@0 2612 stp2_10 = _mm_packs_epi32(tmp5, zero);
michael@0 2613 stp2_13 = _mm_packs_epi32(tmp7, zero);
michael@0 2614
michael@0 2615 stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
michael@0 2616 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
michael@0 2617 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
michael@0 2618 stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
michael@0 2619 }
michael@0 2620
michael@0 2621 // Stage5 and Stage6
michael@0 2622 {
michael@0 2623 stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
michael@0 2624 stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
michael@0 2625 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
michael@0 2626 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
michael@0 2627
michael@0 2628 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
michael@0 2629 stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
michael@0 2630 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
michael@0 2631 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
michael@0 2632
michael@0 2633 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
michael@0 2634 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
michael@0 2635 stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
michael@0 2636 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
michael@0 2637 }
michael@0 2638
michael@0 2639 // Stage6
michael@0 2640 {
michael@0 2641 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
michael@0 2642 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
michael@0 2643 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
michael@0 2644
michael@0 2645 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
michael@0 2646 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
michael@0 2647 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
michael@0 2648 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
michael@0 2649 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
michael@0 2650 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
michael@0 2651
michael@0 2652 tmp1 = _mm_add_epi32(tmp1, rounding);
michael@0 2653 tmp3 = _mm_add_epi32(tmp3, rounding);
michael@0 2654 tmp0 = _mm_add_epi32(tmp0, rounding);
michael@0 2655 tmp2 = _mm_add_epi32(tmp2, rounding);
michael@0 2656 tmp4 = _mm_add_epi32(tmp4, rounding);
michael@0 2657 tmp6 = _mm_add_epi32(tmp6, rounding);
michael@0 2658
michael@0 2659 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
michael@0 2660 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
michael@0 2661 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
michael@0 2662 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
michael@0 2663 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
michael@0 2664 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
michael@0 2665
michael@0 2666 stp1_5 = _mm_packs_epi32(tmp1, zero);
michael@0 2667 stp1_6 = _mm_packs_epi32(tmp3, zero);
michael@0 2668 stp2_10 = _mm_packs_epi32(tmp0, zero);
michael@0 2669 stp2_13 = _mm_packs_epi32(tmp2, zero);
michael@0 2670 stp2_11 = _mm_packs_epi32(tmp4, zero);
michael@0 2671 stp2_12 = _mm_packs_epi32(tmp6, zero);
michael@0 2672
michael@0 2673 stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
michael@0 2674 stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
michael@0 2675 stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
michael@0 2676 stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
michael@0 2677 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
michael@0 2678 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
michael@0 2679 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
michael@0 2680 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
michael@0 2681 }
michael@0 2682
michael@0 2683 // Stage7. Left 8x16 only.
michael@0 2684 l0 = _mm_add_epi16(stp2_0, stp1_15);
michael@0 2685 l1 = _mm_add_epi16(stp2_1, stp1_14);
michael@0 2686 l2 = _mm_add_epi16(stp2_2, stp2_13);
michael@0 2687 l3 = _mm_add_epi16(stp2_3, stp2_12);
michael@0 2688 l4 = _mm_add_epi16(stp2_4, stp2_11);
michael@0 2689 l5 = _mm_add_epi16(stp2_5, stp2_10);
michael@0 2690 l6 = _mm_add_epi16(stp2_6, stp1_9);
michael@0 2691 l7 = _mm_add_epi16(stp2_7, stp1_8);
michael@0 2692 l8 = _mm_sub_epi16(stp2_7, stp1_8);
michael@0 2693 l9 = _mm_sub_epi16(stp2_6, stp1_9);
michael@0 2694 l10 = _mm_sub_epi16(stp2_5, stp2_10);
michael@0 2695 l11 = _mm_sub_epi16(stp2_4, stp2_11);
michael@0 2696 l12 = _mm_sub_epi16(stp2_3, stp2_12);
michael@0 2697 l13 = _mm_sub_epi16(stp2_2, stp2_13);
michael@0 2698 l14 = _mm_sub_epi16(stp2_1, stp1_14);
michael@0 2699 l15 = _mm_sub_epi16(stp2_0, stp1_15);
michael@0 2700
michael@0 2701 // 2-D idct. We process the two 8x16 halves in turn.
michael@0 2702 for (i = 0; i < 2; i++) {
michael@0 2703 if (i == 0)
michael@0 2704 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
michael@0 2705 in5, in6, in7);
michael@0 2706
michael@0 2707 if (i == 1)
michael@0 2708 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
michael@0 2709 in4, in5, in6, in7);
michael@0 2710
michael@0 2711 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
michael@0 2712
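// Full 16-point 1-D idct macro (defined earlier in this file) on in0..in15;
// the upper eight inputs were just zeroed above.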
michael@0 2713 IDCT16_1D
michael@0 2714
michael@0 2715 // Stage7
michael@0 2716 in0 = _mm_add_epi16(stp2_0, stp1_15);
michael@0 2717 in1 = _mm_add_epi16(stp2_1, stp1_14);
michael@0 2718 in2 = _mm_add_epi16(stp2_2, stp2_13);
michael@0 2719 in3 = _mm_add_epi16(stp2_3, stp2_12);
michael@0 2720 in4 = _mm_add_epi16(stp2_4, stp2_11);
michael@0 2721 in5 = _mm_add_epi16(stp2_5, stp2_10);
michael@0 2722 in6 = _mm_add_epi16(stp2_6, stp1_9);
michael@0 2723 in7 = _mm_add_epi16(stp2_7, stp1_8);
michael@0 2724 in8 = _mm_sub_epi16(stp2_7, stp1_8);
michael@0 2725 in9 = _mm_sub_epi16(stp2_6, stp1_9);
michael@0 2726 in10 = _mm_sub_epi16(stp2_5, stp2_10);
michael@0 2727 in11 = _mm_sub_epi16(stp2_4, stp2_11);
michael@0 2728 in12 = _mm_sub_epi16(stp2_3, stp2_12);
michael@0 2729 in13 = _mm_sub_epi16(stp2_2, stp2_13);
michael@0 2730 in14 = _mm_sub_epi16(stp2_1, stp1_14);
michael@0 2731 in15 = _mm_sub_epi16(stp2_0, stp1_15);
michael@0 2732
michael@0 2733 // Final rounding and shift
michael@0 2734 in0 = _mm_adds_epi16(in0, final_rounding);
michael@0 2735 in1 = _mm_adds_epi16(in1, final_rounding);
michael@0 2736 in2 = _mm_adds_epi16(in2, final_rounding);
michael@0 2737 in3 = _mm_adds_epi16(in3, final_rounding);
michael@0 2738 in4 = _mm_adds_epi16(in4, final_rounding);
michael@0 2739 in5 = _mm_adds_epi16(in5, final_rounding);
michael@0 2740 in6 = _mm_adds_epi16(in6, final_rounding);
michael@0 2741 in7 = _mm_adds_epi16(in7, final_rounding);
michael@0 2742 in8 = _mm_adds_epi16(in8, final_rounding);
michael@0 2743 in9 = _mm_adds_epi16(in9, final_rounding);
michael@0 2744 in10 = _mm_adds_epi16(in10, final_rounding);
michael@0 2745 in11 = _mm_adds_epi16(in11, final_rounding);
michael@0 2746 in12 = _mm_adds_epi16(in12, final_rounding);
michael@0 2747 in13 = _mm_adds_epi16(in13, final_rounding);
michael@0 2748 in14 = _mm_adds_epi16(in14, final_rounding);
michael@0 2749 in15 = _mm_adds_epi16(in15, final_rounding);
michael@0 2750
michael@0 2751 in0 = _mm_srai_epi16(in0, 6);
michael@0 2752 in1 = _mm_srai_epi16(in1, 6);
michael@0 2753 in2 = _mm_srai_epi16(in2, 6);
michael@0 2754 in3 = _mm_srai_epi16(in3, 6);
michael@0 2755 in4 = _mm_srai_epi16(in4, 6);
michael@0 2756 in5 = _mm_srai_epi16(in5, 6);
michael@0 2757 in6 = _mm_srai_epi16(in6, 6);
michael@0 2758 in7 = _mm_srai_epi16(in7, 6);
michael@0 2759 in8 = _mm_srai_epi16(in8, 6);
michael@0 2760 in9 = _mm_srai_epi16(in9, 6);
michael@0 2761 in10 = _mm_srai_epi16(in10, 6);
michael@0 2762 in11 = _mm_srai_epi16(in11, 6);
michael@0 2763 in12 = _mm_srai_epi16(in12, 6);
michael@0 2764 in13 = _mm_srai_epi16(in13, 6);
michael@0 2765 in14 = _mm_srai_epi16(in14, 6);
michael@0 2766 in15 = _mm_srai_epi16(in15, 6);
michael@0 2767
michael@0 2768 RECON_AND_STORE(dest, in0);
michael@0 2769 RECON_AND_STORE(dest, in1);
michael@0 2770 RECON_AND_STORE(dest, in2);
michael@0 2771 RECON_AND_STORE(dest, in3);
michael@0 2772 RECON_AND_STORE(dest, in4);
michael@0 2773 RECON_AND_STORE(dest, in5);
michael@0 2774 RECON_AND_STORE(dest, in6);
michael@0 2775 RECON_AND_STORE(dest, in7);
michael@0 2776 RECON_AND_STORE(dest, in8);
michael@0 2777 RECON_AND_STORE(dest, in9);
michael@0 2778 RECON_AND_STORE(dest, in10);
michael@0 2779 RECON_AND_STORE(dest, in11);
michael@0 2780 RECON_AND_STORE(dest, in12);
michael@0 2781 RECON_AND_STORE(dest, in13);
michael@0 2782 RECON_AND_STORE(dest, in14);
michael@0 2783 RECON_AND_STORE(dest, in15);
michael@0 2784
michael@0 2785 dest += 8 - (stride * 16);
michael@0 2786 }
michael@0 2787 }
michael@0 2788
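// Load eight dequantized coefficients and advance the input pointer past
// the loaded chunk.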
michael@0 2789 #define LOAD_DQCOEFF(reg, input) \
michael@0 2790 { \
michael@0 2791 reg = _mm_load_si128((const __m128i *)input); \
michael@0 2792 input += 8; \
michael@0 2793 }
michael@0 2794
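// One 1-D 32-point idct over the eight columns held in in0..in31. Stages 1-7
// below are the standard idct32 butterflies; results are left in
// stp1_0..stp1_31 for the caller's final stage.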
michael@0 2795 #define IDCT32_1D \
michael@0 2796 /* Stage1 */ \
michael@0 2797 { \
michael@0 2798 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \
michael@0 2799 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \
michael@0 2800 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \
michael@0 2801 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \
michael@0 2802 \
michael@0 2803 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \
michael@0 2804 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \
michael@0 2805 const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \
michael@0 2806 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \
michael@0 2807 \
michael@0 2808 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \
michael@0 2809 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \
michael@0 2810 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \
michael@0 2811 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \
michael@0 2812 \
michael@0 2813 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \
michael@0 2814 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \
michael@0 2815 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \
michael@0 2816 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \
michael@0 2817 \
michael@0 2818 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \
michael@0 2819 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \
michael@0 2820 stp1_17, stp1_30) \
michael@0 2821 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \
michael@0 2822 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \
michael@0 2823 stp1_19, stp1_28) \
michael@0 2824 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \
michael@0 2825 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \
michael@0 2826 stp1_21, stp1_26) \
michael@0 2827 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \
michael@0 2828 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \
michael@0 2829 stp1_23, stp1_24) \
michael@0 2830 } \
michael@0 2831 \
michael@0 2832 /* Stage2 */ \
michael@0 2833 { \
michael@0 2834 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \
michael@0 2835 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \
michael@0 2836 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \
michael@0 2837 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \
michael@0 2838 \
michael@0 2839 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \
michael@0 2840 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \
michael@0 2841 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \
michael@0 2842 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \
michael@0 2843 \
michael@0 2844 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \
michael@0 2845 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \
michael@0 2846 stp2_14) \
michael@0 2847 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \
michael@0 2848 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \
michael@0 2849 stp2_11, stp2_12) \
michael@0 2850 \
michael@0 2851 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \
michael@0 2852 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \
michael@0 2853 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \
michael@0 2854 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \
michael@0 2855 \
michael@0 2856 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \
michael@0 2857 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \
michael@0 2858 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \
michael@0 2859 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \
michael@0 2860 \
michael@0 2861 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \
michael@0 2862 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \
michael@0 2863 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \
michael@0 2864 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \
michael@0 2865 \
michael@0 2866 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \
michael@0 2867 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \
michael@0 2868 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \
michael@0 2869 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \
michael@0 2870 } \
michael@0 2871 \
michael@0 2872 /* Stage3 */ \
michael@0 2873 { \
michael@0 2874 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \
michael@0 2875 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \
michael@0 2876 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \
michael@0 2877 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \
michael@0 2878 \
michael@0 2879 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \
michael@0 2880 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \
michael@0 2881 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
michael@0 2882 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
michael@0 2883 \
michael@0 2884 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
michael@0 2885 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
michael@0 2886 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
michael@0 2887 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
michael@0 2888 \
michael@0 2889 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \
michael@0 2890 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \
michael@0 2891 stp1_6) \
michael@0 2892 \
michael@0 2893 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \
michael@0 2894 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
michael@0 2895 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
michael@0 2896 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
michael@0 2897 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \
michael@0 2898 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
michael@0 2899 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
michael@0 2900 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
michael@0 2901 \
michael@0 2902 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \
michael@0 2903 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \
michael@0 2904 stp1_18, stp1_29) \
michael@0 2905 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \
michael@0 2906 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \
michael@0 2907 stp1_22, stp1_25) \
michael@0 2908 \
michael@0 2909 stp1_16 = stp2_16; \
michael@0 2910 stp1_31 = stp2_31; \
michael@0 2911 stp1_19 = stp2_19; \
michael@0 2912 stp1_20 = stp2_20; \
michael@0 2913 stp1_23 = stp2_23; \
michael@0 2914 stp1_24 = stp2_24; \
michael@0 2915 stp1_27 = stp2_27; \
michael@0 2916 stp1_28 = stp2_28; \
michael@0 2917 } \
michael@0 2918 \
michael@0 2919 /* Stage4 */ \
michael@0 2920 { \
michael@0 2921 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \
michael@0 2922 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \
michael@0 2923 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \
michael@0 2924 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \
michael@0 2925 \
michael@0 2926 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
michael@0 2927 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
michael@0 2928 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
michael@0 2929 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
michael@0 2930 \
michael@0 2931 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \
michael@0 2932 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \
michael@0 2933 stp2_2, stp2_3) \
michael@0 2934 \
michael@0 2935 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
michael@0 2936 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
michael@0 2937 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
michael@0 2938 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
michael@0 2939 \
michael@0 2940 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \
michael@0 2941 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \
michael@0 2942 stp2_10, stp2_13) \
michael@0 2943 \
michael@0 2944 stp2_8 = stp1_8; \
michael@0 2945 stp2_15 = stp1_15; \
michael@0 2946 stp2_11 = stp1_11; \
michael@0 2947 stp2_12 = stp1_12; \
michael@0 2948 \
michael@0 2949 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \
michael@0 2950 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \
michael@0 2951 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \
michael@0 2952 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \
michael@0 2953 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \
michael@0 2954 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \
michael@0 2955 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \
michael@0 2956 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \
michael@0 2957 \
michael@0 2958 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \
michael@0 2959 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \
michael@0 2960 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \
michael@0 2961 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \
michael@0 2962 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \
michael@0 2963 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \
michael@0 2964 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \
michael@0 2965 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \
michael@0 2966 } \
michael@0 2967 \
michael@0 2968 /* Stage5 */ \
michael@0 2969 { \
michael@0 2970 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
michael@0 2971 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
michael@0 2972 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \
michael@0 2973 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \
michael@0 2974 \
michael@0 2975 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \
michael@0 2976 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \
michael@0 2977 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
michael@0 2978 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
michael@0 2979 \
michael@0 2980 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
michael@0 2981 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
michael@0 2982 \
michael@0 2983 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
michael@0 2984 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
michael@0 2985 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
michael@0 2986 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
michael@0 2987 \
michael@0 2988 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
michael@0 2989 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
michael@0 2990 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
michael@0 2991 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
michael@0 2992 \
michael@0 2993 tmp0 = _mm_add_epi32(tmp0, rounding); \
michael@0 2994 tmp1 = _mm_add_epi32(tmp1, rounding); \
michael@0 2995 tmp2 = _mm_add_epi32(tmp2, rounding); \
michael@0 2996 tmp3 = _mm_add_epi32(tmp3, rounding); \
michael@0 2997 \
michael@0 2998 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
michael@0 2999 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
michael@0 3000 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
michael@0 3001 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
michael@0 3002 \
michael@0 3003 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
michael@0 3004 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
michael@0 3005 \
michael@0 3006 stp1_4 = stp2_4; \
michael@0 3007 stp1_7 = stp2_7; \
michael@0 3008 \
michael@0 3009 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \
michael@0 3010 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
michael@0 3011 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
michael@0 3012 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \
michael@0 3013 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \
michael@0 3014 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
michael@0 3015 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
michael@0 3016 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \
michael@0 3017 \
michael@0 3018 stp1_16 = stp2_16; \
michael@0 3019 stp1_17 = stp2_17; \
michael@0 3020 \
michael@0 3021 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \
michael@0 3022 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \
michael@0 3023 stp1_19, stp1_28) \
michael@0 3024 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \
michael@0 3025 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \
michael@0 3026 stp1_21, stp1_26) \
michael@0 3027 \
michael@0 3028 stp1_22 = stp2_22; \
michael@0 3029 stp1_23 = stp2_23; \
michael@0 3030 stp1_24 = stp2_24; \
michael@0 3031 stp1_25 = stp2_25; \
michael@0 3032 stp1_30 = stp2_30; \
michael@0 3033 stp1_31 = stp2_31; \
michael@0 3034 } \
michael@0 3035 \
michael@0 3036 /* Stage6 */ \
michael@0 3037 { \
michael@0 3038 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
michael@0 3039 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
michael@0 3040 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
michael@0 3041 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
michael@0 3042 \
michael@0 3043 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \
michael@0 3044 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
michael@0 3045 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
michael@0 3046 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \
michael@0 3047 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \
michael@0 3048 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
michael@0 3049 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
michael@0 3050 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \
michael@0 3051 \
michael@0 3052 stp2_8 = stp1_8; \
michael@0 3053 stp2_9 = stp1_9; \
michael@0 3054 stp2_14 = stp1_14; \
michael@0 3055 stp2_15 = stp1_15; \
michael@0 3056 \
michael@0 3057 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
michael@0 3058 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \
michael@0 3059 stp2_13, stp2_11, stp2_12) \
michael@0 3060 \
michael@0 3061 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \
michael@0 3062 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \
michael@0 3063 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \
michael@0 3064 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \
michael@0 3065 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \
michael@0 3066 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \
michael@0 3067 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \
michael@0 3068 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \
michael@0 3069 \
michael@0 3070 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \
michael@0 3071 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \
michael@0 3072 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \
michael@0 3073 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \
michael@0 3074 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \
michael@0 3075 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \
michael@0 3076 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \
michael@0 3077 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \
michael@0 3078 } \
michael@0 3079 \
michael@0 3080 /* Stage7 */ \
michael@0 3081 { \
michael@0 3082 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \
michael@0 3083 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \
michael@0 3084 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \
michael@0 3085 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \
michael@0 3086 \
michael@0 3087 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \
michael@0 3088 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \
michael@0 3089 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \
michael@0 3090 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \
michael@0 3091 \
michael@0 3092 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \
michael@0 3093 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \
michael@0 3094 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \
michael@0 3095 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \
michael@0 3096 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \
michael@0 3097 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \
michael@0 3098 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \
michael@0 3099 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \
michael@0 3100 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \
michael@0 3101 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \
michael@0 3102 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \
michael@0 3103 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \
michael@0 3104 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \
michael@0 3105 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \
michael@0 3106 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \
michael@0 3107 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \
michael@0 3108 \
michael@0 3109 stp1_16 = stp2_16; \
michael@0 3110 stp1_17 = stp2_17; \
michael@0 3111 stp1_18 = stp2_18; \
michael@0 3112 stp1_19 = stp2_19; \
michael@0 3113 \
michael@0 3114 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \
michael@0 3115 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \
michael@0 3116 stp1_21, stp1_26) \
michael@0 3117 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \
michael@0 3118 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \
michael@0 3119 stp1_23, stp1_24) \
michael@0 3120 \
michael@0 3121 stp1_28 = stp2_28; \
michael@0 3122 stp1_29 = stp2_29; \
michael@0 3123 stp1_30 = stp2_30; \
michael@0 3124 stp1_31 = stp2_31; \
michael@0 3125 }
michael@0 3126
michael@0 3127 // 32x32 inverse transform when at most 34 coefficients are non-zero; only the upper-left 8x8 of the input can hold them.
michael@0 3128 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest,
michael@0 3129 int stride) {
michael@0 3130 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 3131 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
michael@0 3132
michael@0 3133 // idct constants for each stage
michael@0 3134 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
michael@0 3135 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
michael@0 3136 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
michael@0 3137 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
michael@0 3138 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
michael@0 3139 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
michael@0 3140 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
michael@0 3141 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
michael@0 3142 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
michael@0 3143 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
michael@0 3144 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
michael@0 3145 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
michael@0 3146 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
michael@0 3147 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
michael@0 3148 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
michael@0 3149 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
michael@0 3150
michael@0 3151 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
michael@0 3152 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
michael@0 3153 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
michael@0 3154 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
michael@0 3155 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
michael@0 3156 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
michael@0 3157 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
michael@0 3158 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
michael@0 3159
michael@0 3160 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
michael@0 3161 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
michael@0 3162 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
michael@0 3163 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
michael@0 3164 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
michael@0 3165 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
michael@0 3166 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
michael@0 3167 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
michael@0 3168 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
michael@0 3169 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
michael@0 3170
michael@0 3171 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
michael@0 3172 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 3173 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
michael@0 3174 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
michael@0 3175 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 3176 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
michael@0 3177 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
michael@0 3178
michael@0 3179 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 3180
michael@0 3181 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
michael@0 3182 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
michael@0 3183 in24, in25, in26, in27, in28, in29, in30, in31;
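// col[] buffers the complete first-pass result: four 8-row bands of 32
// vectors each (4 * 32 = 128).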
michael@0 3184 __m128i col[128];
michael@0 3185 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
michael@0 3186 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
michael@0 3187 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
michael@0 3188 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
michael@0 3189 stp1_30, stp1_31;
michael@0 3190 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
michael@0 3191 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
michael@0 3192 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
michael@0 3193 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
michael@0 3194 stp2_30, stp2_31;
michael@0 3195 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
michael@0 3196 int i, j, i32;
michael@0 3197
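// Pass structure: i = 0 transforms the only band with non-zero input and
// stores 32 intermediate vectors in col[]; i = 1..3 zero-fill col[] for the
// all-zero rows; i = 4..7 transpose col[] back and run the second 1-D pass,
// writing the reconstruction to dest.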
michael@0 3198 // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct.
michael@0 3199 for (i = 0; i < 8; i++) {
michael@0 3200 i32 = (i << 5);
michael@0 3201 if (i == 0) {
michael@0 3202 // First 1-D idct: first 8 rows
michael@0 3203 // Load input data.
michael@0 3204 LOAD_DQCOEFF(in0, input);
michael@0 3205 LOAD_DQCOEFF(in8, input);
michael@0 3206 LOAD_DQCOEFF(in16, input);
michael@0 3207 LOAD_DQCOEFF(in24, input);
michael@0 3208 LOAD_DQCOEFF(in1, input);
michael@0 3209 LOAD_DQCOEFF(in9, input);
michael@0 3210 LOAD_DQCOEFF(in17, input);
michael@0 3211 LOAD_DQCOEFF(in25, input);
michael@0 3212 LOAD_DQCOEFF(in2, input);
michael@0 3213 LOAD_DQCOEFF(in10, input);
michael@0 3214 LOAD_DQCOEFF(in18, input);
michael@0 3215 LOAD_DQCOEFF(in26, input);
michael@0 3216 LOAD_DQCOEFF(in3, input);
michael@0 3217 LOAD_DQCOEFF(in11, input);
michael@0 3218 LOAD_DQCOEFF(in19, input);
michael@0 3219 LOAD_DQCOEFF(in27, input);
michael@0 3220
michael@0 3221 LOAD_DQCOEFF(in4, input);
michael@0 3222 LOAD_DQCOEFF(in12, input);
michael@0 3223 LOAD_DQCOEFF(in20, input);
michael@0 3224 LOAD_DQCOEFF(in28, input);
michael@0 3225 LOAD_DQCOEFF(in5, input);
michael@0 3226 LOAD_DQCOEFF(in13, input);
michael@0 3227 LOAD_DQCOEFF(in21, input);
michael@0 3228 LOAD_DQCOEFF(in29, input);
michael@0 3229 LOAD_DQCOEFF(in6, input);
michael@0 3230 LOAD_DQCOEFF(in14, input);
michael@0 3231 LOAD_DQCOEFF(in22, input);
michael@0 3232 LOAD_DQCOEFF(in30, input);
michael@0 3233 LOAD_DQCOEFF(in7, input);
michael@0 3234 LOAD_DQCOEFF(in15, input);
michael@0 3235 LOAD_DQCOEFF(in23, input);
michael@0 3236 LOAD_DQCOEFF(in31, input);
michael@0 3237
michael@0 3238 // Transpose 32x8 block to 8x32 block
michael@0 3239 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
michael@0 3240 in4, in5, in6, in7);
michael@0 3241 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
michael@0 3242 in10, in11, in12, in13, in14, in15);
michael@0 3243 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
michael@0 3244 in18, in19, in20, in21, in22, in23);
michael@0 3245 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
michael@0 3246 in26, in27, in28, in29, in30, in31);
michael@0 3247 } else if (i < 4) {
michael@0 3248 // First 1-D idct: rows 8-31 are all zero, so just store zero results.
michael@0 3249 col[i32 + 0] = _mm_setzero_si128();
michael@0 3250 col[i32 + 1] = _mm_setzero_si128();
michael@0 3251 col[i32 + 2] = _mm_setzero_si128();
michael@0 3252 col[i32 + 3] = _mm_setzero_si128();
michael@0 3253 col[i32 + 4] = _mm_setzero_si128();
michael@0 3254 col[i32 + 5] = _mm_setzero_si128();
michael@0 3255 col[i32 + 6] = _mm_setzero_si128();
michael@0 3256 col[i32 + 7] = _mm_setzero_si128();
michael@0 3257 col[i32 + 8] = _mm_setzero_si128();
michael@0 3258 col[i32 + 9] = _mm_setzero_si128();
michael@0 3259 col[i32 + 10] = _mm_setzero_si128();
michael@0 3260 col[i32 + 11] = _mm_setzero_si128();
michael@0 3261 col[i32 + 12] = _mm_setzero_si128();
michael@0 3262 col[i32 + 13] = _mm_setzero_si128();
michael@0 3263 col[i32 + 14] = _mm_setzero_si128();
michael@0 3264 col[i32 + 15] = _mm_setzero_si128();
michael@0 3265 col[i32 + 16] = _mm_setzero_si128();
michael@0 3266 col[i32 + 17] = _mm_setzero_si128();
michael@0 3267 col[i32 + 18] = _mm_setzero_si128();
michael@0 3268 col[i32 + 19] = _mm_setzero_si128();
michael@0 3269 col[i32 + 20] = _mm_setzero_si128();
michael@0 3270 col[i32 + 21] = _mm_setzero_si128();
michael@0 3271 col[i32 + 22] = _mm_setzero_si128();
michael@0 3272 col[i32 + 23] = _mm_setzero_si128();
michael@0 3273 col[i32 + 24] = _mm_setzero_si128();
michael@0 3274 col[i32 + 25] = _mm_setzero_si128();
michael@0 3275 col[i32 + 26] = _mm_setzero_si128();
michael@0 3276 col[i32 + 27] = _mm_setzero_si128();
michael@0 3277 col[i32 + 28] = _mm_setzero_si128();
michael@0 3278 col[i32 + 29] = _mm_setzero_si128();
michael@0 3279 col[i32 + 30] = _mm_setzero_si128();
michael@0 3280 col[i32 + 31] = _mm_setzero_si128();
michael@0 3281 continue;
michael@0 3282 } else {
michael@0 3283 // Second 1-D idct
michael@0 3284 j = i - 4;
michael@0 3285
michael@0 3286 // Transpose 32x8 block to 8x32 block
michael@0 3287 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3288 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3289 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
michael@0 3290 in5, in6, in7);
michael@0 3291 j += 4;
michael@0 3292 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3293 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3294 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
michael@0 3295 in11, in12, in13, in14, in15);
michael@0 3296 j += 4;
michael@0 3297 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3298 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3299 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
michael@0 3300 in19, in20, in21, in22, in23);
michael@0 3301 j += 4;
michael@0 3302 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3303 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3304 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
michael@0 3305 in28, in29, in30, in31);
michael@0 3306 }
michael@0 3307
michael@0 3308 IDCT32_1D
michael@0 3309
michael@0 3310 // final stage
michael@0 3311 if (i < 4) {
michael@0 3312 // 1-D: Store the 32 intermediate results for this 8x32 block.
michael@0 3313 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
michael@0 3314 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
michael@0 3315 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
michael@0 3316 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
michael@0 3317 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
michael@0 3318 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
michael@0 3319 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
michael@0 3320 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
michael@0 3321 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
michael@0 3322 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
michael@0 3323 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
michael@0 3324 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
michael@0 3325 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
michael@0 3326 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
michael@0 3327 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
michael@0 3328 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
michael@0 3329 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
michael@0 3330 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
michael@0 3331 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
michael@0 3332 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
michael@0 3333 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
michael@0 3334 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
michael@0 3335 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
michael@0 3336 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
michael@0 3337 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
michael@0 3338 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
michael@0 3339 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
michael@0 3340 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
michael@0 3341 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
michael@0 3342 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
michael@0 3343 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
michael@0 3344 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
michael@0 3345 } else {
michael@0 3346 const __m128i zero = _mm_setzero_si128();
michael@0 3347
michael@0 3348 // 2-D: Calculate the final results and store them to the destination.
michael@0 3349 in0 = _mm_add_epi16(stp1_0, stp1_31);
michael@0 3350 in1 = _mm_add_epi16(stp1_1, stp1_30);
michael@0 3351 in2 = _mm_add_epi16(stp1_2, stp1_29);
michael@0 3352 in3 = _mm_add_epi16(stp1_3, stp1_28);
michael@0 3353 in4 = _mm_add_epi16(stp1_4, stp1_27);
michael@0 3354 in5 = _mm_add_epi16(stp1_5, stp1_26);
michael@0 3355 in6 = _mm_add_epi16(stp1_6, stp1_25);
michael@0 3356 in7 = _mm_add_epi16(stp1_7, stp1_24);
michael@0 3357 in8 = _mm_add_epi16(stp1_8, stp1_23);
michael@0 3358 in9 = _mm_add_epi16(stp1_9, stp1_22);
michael@0 3359 in10 = _mm_add_epi16(stp1_10, stp1_21);
michael@0 3360 in11 = _mm_add_epi16(stp1_11, stp1_20);
michael@0 3361 in12 = _mm_add_epi16(stp1_12, stp1_19);
michael@0 3362 in13 = _mm_add_epi16(stp1_13, stp1_18);
michael@0 3363 in14 = _mm_add_epi16(stp1_14, stp1_17);
michael@0 3364 in15 = _mm_add_epi16(stp1_15, stp1_16);
michael@0 3365 in16 = _mm_sub_epi16(stp1_15, stp1_16);
michael@0 3366 in17 = _mm_sub_epi16(stp1_14, stp1_17);
michael@0 3367 in18 = _mm_sub_epi16(stp1_13, stp1_18);
michael@0 3368 in19 = _mm_sub_epi16(stp1_12, stp1_19);
michael@0 3369 in20 = _mm_sub_epi16(stp1_11, stp1_20);
michael@0 3370 in21 = _mm_sub_epi16(stp1_10, stp1_21);
michael@0 3371 in22 = _mm_sub_epi16(stp1_9, stp1_22);
michael@0 3372 in23 = _mm_sub_epi16(stp1_8, stp1_23);
michael@0 3373 in24 = _mm_sub_epi16(stp1_7, stp1_24);
michael@0 3374 in25 = _mm_sub_epi16(stp1_6, stp1_25);
michael@0 3375 in26 = _mm_sub_epi16(stp1_5, stp1_26);
michael@0 3376 in27 = _mm_sub_epi16(stp1_4, stp1_27);
michael@0 3377 in28 = _mm_sub_epi16(stp1_3, stp1_28);
michael@0 3378 in29 = _mm_sub_epi16(stp1_2, stp1_29);
michael@0 3379 in30 = _mm_sub_epi16(stp1_1, stp1_30);
michael@0 3380 in31 = _mm_sub_epi16(stp1_0, stp1_31);
michael@0 3381
michael@0 3382 // Final rounding and shift
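        // final_rounding holds 1 << 5 = 32 per lane, so the saturating add
        // plus the arithmetic right shift by 6 below compute
        // ROUND_POWER_OF_TWO(x, 6), removing the 2-D transform's
        // fixed-point gain; _mm_adds_epi16 saturates instead of wrapping
        // if a sum would overflow int16.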
michael@0 3383 in0 = _mm_adds_epi16(in0, final_rounding);
michael@0 3384 in1 = _mm_adds_epi16(in1, final_rounding);
michael@0 3385 in2 = _mm_adds_epi16(in2, final_rounding);
michael@0 3386 in3 = _mm_adds_epi16(in3, final_rounding);
michael@0 3387 in4 = _mm_adds_epi16(in4, final_rounding);
michael@0 3388 in5 = _mm_adds_epi16(in5, final_rounding);
michael@0 3389 in6 = _mm_adds_epi16(in6, final_rounding);
michael@0 3390 in7 = _mm_adds_epi16(in7, final_rounding);
michael@0 3391 in8 = _mm_adds_epi16(in8, final_rounding);
michael@0 3392 in9 = _mm_adds_epi16(in9, final_rounding);
michael@0 3393 in10 = _mm_adds_epi16(in10, final_rounding);
michael@0 3394 in11 = _mm_adds_epi16(in11, final_rounding);
michael@0 3395 in12 = _mm_adds_epi16(in12, final_rounding);
michael@0 3396 in13 = _mm_adds_epi16(in13, final_rounding);
michael@0 3397 in14 = _mm_adds_epi16(in14, final_rounding);
michael@0 3398 in15 = _mm_adds_epi16(in15, final_rounding);
michael@0 3399 in16 = _mm_adds_epi16(in16, final_rounding);
michael@0 3400 in17 = _mm_adds_epi16(in17, final_rounding);
michael@0 3401 in18 = _mm_adds_epi16(in18, final_rounding);
michael@0 3402 in19 = _mm_adds_epi16(in19, final_rounding);
michael@0 3403 in20 = _mm_adds_epi16(in20, final_rounding);
michael@0 3404 in21 = _mm_adds_epi16(in21, final_rounding);
michael@0 3405 in22 = _mm_adds_epi16(in22, final_rounding);
michael@0 3406 in23 = _mm_adds_epi16(in23, final_rounding);
michael@0 3407 in24 = _mm_adds_epi16(in24, final_rounding);
michael@0 3408 in25 = _mm_adds_epi16(in25, final_rounding);
michael@0 3409 in26 = _mm_adds_epi16(in26, final_rounding);
michael@0 3410 in27 = _mm_adds_epi16(in27, final_rounding);
michael@0 3411 in28 = _mm_adds_epi16(in28, final_rounding);
michael@0 3412 in29 = _mm_adds_epi16(in29, final_rounding);
michael@0 3413 in30 = _mm_adds_epi16(in30, final_rounding);
michael@0 3414 in31 = _mm_adds_epi16(in31, final_rounding);
michael@0 3415
michael@0 3416 in0 = _mm_srai_epi16(in0, 6);
michael@0 3417 in1 = _mm_srai_epi16(in1, 6);
michael@0 3418 in2 = _mm_srai_epi16(in2, 6);
michael@0 3419 in3 = _mm_srai_epi16(in3, 6);
michael@0 3420 in4 = _mm_srai_epi16(in4, 6);
michael@0 3421 in5 = _mm_srai_epi16(in5, 6);
michael@0 3422 in6 = _mm_srai_epi16(in6, 6);
michael@0 3423 in7 = _mm_srai_epi16(in7, 6);
michael@0 3424 in8 = _mm_srai_epi16(in8, 6);
michael@0 3425 in9 = _mm_srai_epi16(in9, 6);
michael@0 3426 in10 = _mm_srai_epi16(in10, 6);
michael@0 3427 in11 = _mm_srai_epi16(in11, 6);
michael@0 3428 in12 = _mm_srai_epi16(in12, 6);
michael@0 3429 in13 = _mm_srai_epi16(in13, 6);
michael@0 3430 in14 = _mm_srai_epi16(in14, 6);
michael@0 3431 in15 = _mm_srai_epi16(in15, 6);
michael@0 3432 in16 = _mm_srai_epi16(in16, 6);
michael@0 3433 in17 = _mm_srai_epi16(in17, 6);
michael@0 3434 in18 = _mm_srai_epi16(in18, 6);
michael@0 3435 in19 = _mm_srai_epi16(in19, 6);
michael@0 3436 in20 = _mm_srai_epi16(in20, 6);
michael@0 3437 in21 = _mm_srai_epi16(in21, 6);
michael@0 3438 in22 = _mm_srai_epi16(in22, 6);
michael@0 3439 in23 = _mm_srai_epi16(in23, 6);
michael@0 3440 in24 = _mm_srai_epi16(in24, 6);
michael@0 3441 in25 = _mm_srai_epi16(in25, 6);
michael@0 3442 in26 = _mm_srai_epi16(in26, 6);
michael@0 3443 in27 = _mm_srai_epi16(in27, 6);
michael@0 3444 in28 = _mm_srai_epi16(in28, 6);
michael@0 3445 in29 = _mm_srai_epi16(in29, 6);
michael@0 3446 in30 = _mm_srai_epi16(in30, 6);
michael@0 3447 in31 = _mm_srai_epi16(in31, 6);
michael@0 3448
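        // RECON_AND_STORE (a macro defined earlier in this file)
        // reconstructs one row: it widens eight destination pixels to
        // 16 bits, adds the residual vector, packs back to bytes with
        // unsigned saturation (clamping to [0, 255]), stores the eight
        // bytes, and advances dest by stride.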
michael@0 3449 RECON_AND_STORE(dest, in0);
michael@0 3450 RECON_AND_STORE(dest, in1);
michael@0 3451 RECON_AND_STORE(dest, in2);
michael@0 3452 RECON_AND_STORE(dest, in3);
michael@0 3453 RECON_AND_STORE(dest, in4);
michael@0 3454 RECON_AND_STORE(dest, in5);
michael@0 3455 RECON_AND_STORE(dest, in6);
michael@0 3456 RECON_AND_STORE(dest, in7);
michael@0 3457 RECON_AND_STORE(dest, in8);
michael@0 3458 RECON_AND_STORE(dest, in9);
michael@0 3459 RECON_AND_STORE(dest, in10);
michael@0 3460 RECON_AND_STORE(dest, in11);
michael@0 3461 RECON_AND_STORE(dest, in12);
michael@0 3462 RECON_AND_STORE(dest, in13);
michael@0 3463 RECON_AND_STORE(dest, in14);
michael@0 3464 RECON_AND_STORE(dest, in15);
michael@0 3465 RECON_AND_STORE(dest, in16);
michael@0 3466 RECON_AND_STORE(dest, in17);
michael@0 3467 RECON_AND_STORE(dest, in18);
michael@0 3468 RECON_AND_STORE(dest, in19);
michael@0 3469 RECON_AND_STORE(dest, in20);
michael@0 3470 RECON_AND_STORE(dest, in21);
michael@0 3471 RECON_AND_STORE(dest, in22);
michael@0 3472 RECON_AND_STORE(dest, in23);
michael@0 3473 RECON_AND_STORE(dest, in24);
michael@0 3474 RECON_AND_STORE(dest, in25);
michael@0 3475 RECON_AND_STORE(dest, in26);
michael@0 3476 RECON_AND_STORE(dest, in27);
michael@0 3477 RECON_AND_STORE(dest, in28);
michael@0 3478 RECON_AND_STORE(dest, in29);
michael@0 3479 RECON_AND_STORE(dest, in30);
michael@0 3480 RECON_AND_STORE(dest, in31);
michael@0 3481
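        // The 32 stores above advanced dest by 32 rows; stepping back by
        // stride * 32 and forward by 8 moves to the next 8-pixel-wide
        // column group of the same 32 rows.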
michael@0 3482 dest += 8 - (stride * 32);
michael@0 3483 }
michael@0 3484 }
michael@0 3485 }
michael@0 3486
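/*
 * Full 32x32 inverse DCT with reconstruction. The 2-D transform is done
 * as 8 passes over 8x32 slices: 4 column passes that buffer intermediate
 * results in col[], then 4 row passes that reconstruct into dest. Each
 * 1-D stage is a chain of rotations evaluated with _mm_madd_epi16; a
 * scalar sketch of one rotation, with a and b standing for two cospi
 * constants in 14-bit fixed point:
 *
 *   out0 = ROUND_POWER_OF_TWO(x * a - y * b, DCT_CONST_BITS);
 *   out1 = ROUND_POWER_OF_TWO(x * b + y * a, DCT_CONST_BITS);
 */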
michael@0 3487 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest,
michael@0 3488 int stride) {
michael@0 3489 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 3490 const __m128i final_rounding = _mm_set1_epi16(1 << 5);
michael@0 3491
michael@0 3492 // idct constants for each stage
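  // pair_set_epi16(a, b) builds the vector {a, b, a, b, ...}; against
  // inputs interleaved as {x, y, x, y, ...}, _mm_madd_epi16 then yields
  // a * x + b * y in each 32-bit lane, i.e. one output of a rotation.
  // The cospi_k_64 values are approximately cos(k * pi / 64) scaled by
  // 2^14 (DCT_CONST_BITS).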
michael@0 3493 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
michael@0 3494 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
michael@0 3495 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
michael@0 3496 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
michael@0 3497 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
michael@0 3498 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
michael@0 3499 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
michael@0 3500 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
michael@0 3501 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
michael@0 3502 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
michael@0 3503 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
michael@0 3504 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
michael@0 3505 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
michael@0 3506 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
michael@0 3507 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
michael@0 3508 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
michael@0 3509
michael@0 3510 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
michael@0 3511 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
michael@0 3512 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
michael@0 3513 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
michael@0 3514 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
michael@0 3515 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
michael@0 3516 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
michael@0 3517 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
michael@0 3518
michael@0 3519 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
michael@0 3520 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
michael@0 3521 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
michael@0 3522 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
michael@0 3523 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
michael@0 3524 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
michael@0 3525 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
michael@0 3526 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
michael@0 3527 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
michael@0 3528 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
michael@0 3529
michael@0 3530 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
michael@0 3531 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 3532 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
michael@0 3533 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
michael@0 3534 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 3535 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
michael@0 3536 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
michael@0 3537
michael@0 3538 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 3539
michael@0 3540 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
michael@0 3541 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
michael@0 3542 in24, in25, in26, in27, in28, in29, in30, in31;
michael@0 3543 __m128i col[128];
michael@0 3544 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
michael@0 3545 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
michael@0 3546 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
michael@0 3547 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
michael@0 3548 stp1_30, stp1_31;
michael@0 3549 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
michael@0 3550 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
michael@0 3551 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
michael@0 3552 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
michael@0 3553 stp2_30, stp2_31;
michael@0 3554 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
michael@0 3555 int i, j, i32;
michael@0 3556 __m128i zero_idx[16];
michael@0 3557 int zero_flag[2];
michael@0 3558
michael@0 3559 // We work on an 8x32 block at a time, looping 8 times for the 2-D 32x32 IDCT.
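  // Pass structure: passes 0-3 run the first 1-D IDCT over the columns
  // of four 8x32 input slices and buffer the results in col[]; passes
  // 4-7 re-read col[], transpose, run the second 1-D IDCT over the rows,
  // and reconstruct straight into dest.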
michael@0 3560 for (i = 0; i < 8; i++) {
michael@0 3561 i32 = (i << 5);
michael@0 3562 if (i < 4) {
michael@0 3563 // First 1-D idct
michael@0 3564 // Load input data.
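      // Each LOAD_DQCOEFF (a macro defined earlier in this file) reads
      // eight 16-bit coefficients and advances the input pointer, so this
      // run of 32 loads consumes one 8-row by 32-column slice. The names
      // follow the column group: in0/in8/in16/in24 hold columns 0-7,
      // 8-15, 16-23 and 24-31 of the first row, in1/in9/in17/in25 the
      // second row, and so on.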
michael@0 3565 LOAD_DQCOEFF(in0, input);
michael@0 3566 LOAD_DQCOEFF(in8, input);
michael@0 3567 LOAD_DQCOEFF(in16, input);
michael@0 3568 LOAD_DQCOEFF(in24, input);
michael@0 3569 LOAD_DQCOEFF(in1, input);
michael@0 3570 LOAD_DQCOEFF(in9, input);
michael@0 3571 LOAD_DQCOEFF(in17, input);
michael@0 3572 LOAD_DQCOEFF(in25, input);
michael@0 3573 LOAD_DQCOEFF(in2, input);
michael@0 3574 LOAD_DQCOEFF(in10, input);
michael@0 3575 LOAD_DQCOEFF(in18, input);
michael@0 3576 LOAD_DQCOEFF(in26, input);
michael@0 3577 LOAD_DQCOEFF(in3, input);
michael@0 3578 LOAD_DQCOEFF(in11, input);
michael@0 3579 LOAD_DQCOEFF(in19, input);
michael@0 3580 LOAD_DQCOEFF(in27, input);
michael@0 3581
michael@0 3582 LOAD_DQCOEFF(in4, input);
michael@0 3583 LOAD_DQCOEFF(in12, input);
michael@0 3584 LOAD_DQCOEFF(in20, input);
michael@0 3585 LOAD_DQCOEFF(in28, input);
michael@0 3586 LOAD_DQCOEFF(in5, input);
michael@0 3587 LOAD_DQCOEFF(in13, input);
michael@0 3588 LOAD_DQCOEFF(in21, input);
michael@0 3589 LOAD_DQCOEFF(in29, input);
michael@0 3590 LOAD_DQCOEFF(in6, input);
michael@0 3591 LOAD_DQCOEFF(in14, input);
michael@0 3592 LOAD_DQCOEFF(in22, input);
michael@0 3593 LOAD_DQCOEFF(in30, input);
michael@0 3594 LOAD_DQCOEFF(in7, input);
michael@0 3595 LOAD_DQCOEFF(in15, input);
michael@0 3596 LOAD_DQCOEFF(in23, input);
michael@0 3597 LOAD_DQCOEFF(in31, input);
michael@0 3598
michael@0 3599 // Check whether all entries are zero.
michael@0 3600 zero_idx[0] = _mm_or_si128(in0, in1);
michael@0 3601 zero_idx[1] = _mm_or_si128(in2, in3);
michael@0 3602 zero_idx[2] = _mm_or_si128(in4, in5);
michael@0 3603 zero_idx[3] = _mm_or_si128(in6, in7);
michael@0 3604 zero_idx[4] = _mm_or_si128(in8, in9);
michael@0 3605 zero_idx[5] = _mm_or_si128(in10, in11);
michael@0 3606 zero_idx[6] = _mm_or_si128(in12, in13);
michael@0 3607 zero_idx[7] = _mm_or_si128(in14, in15);
michael@0 3608 zero_idx[8] = _mm_or_si128(in16, in17);
michael@0 3609 zero_idx[9] = _mm_or_si128(in18, in19);
michael@0 3610 zero_idx[10] = _mm_or_si128(in20, in21);
michael@0 3611 zero_idx[11] = _mm_or_si128(in22, in23);
michael@0 3612 zero_idx[12] = _mm_or_si128(in24, in25);
michael@0 3613 zero_idx[13] = _mm_or_si128(in26, in27);
michael@0 3614 zero_idx[14] = _mm_or_si128(in28, in29);
michael@0 3615 zero_idx[15] = _mm_or_si128(in30, in31);
michael@0 3616
michael@0 3617 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]);
michael@0 3618 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]);
michael@0 3619 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]);
michael@0 3620 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]);
michael@0 3621 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]);
michael@0 3622 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]);
michael@0 3623 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]);
michael@0 3624 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]);
michael@0 3625
michael@0 3626 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]);
michael@0 3627 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]);
michael@0 3628 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]);
michael@0 3629 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]);
michael@0 3630 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]);
michael@0 3631 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]);
michael@0 3632 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]);
michael@0 3633
michael@0 3634 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]);
michael@0 3635 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]);
michael@0 3636 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32);
michael@0 3637 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]);
michael@0 3638 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]);
michael@0 3639
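      // The OR tree above folds all 32 input vectors into zero_idx[14],
      // then collapses its 128 bits into two 32-bit words; if both words
      // are zero, every coefficient in this slice is zero and the pass can
      // be skipped. A scalar sketch of the same test (block being a
      // hypothetical pointer to the 256 coefficients just loaded):
      //
      //   int nonzero = 0;
      //   for (k = 0; k < 256; ++k)
      //     nonzero |= block[k];
      //   if (!nonzero) { /* store zero columns and continue */ }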
michael@0 3640 if (!zero_flag[0] && !zero_flag[1]) {
michael@0 3641 col[i32 + 0] = _mm_setzero_si128();
michael@0 3642 col[i32 + 1] = _mm_setzero_si128();
michael@0 3643 col[i32 + 2] = _mm_setzero_si128();
michael@0 3644 col[i32 + 3] = _mm_setzero_si128();
michael@0 3645 col[i32 + 4] = _mm_setzero_si128();
michael@0 3646 col[i32 + 5] = _mm_setzero_si128();
michael@0 3647 col[i32 + 6] = _mm_setzero_si128();
michael@0 3648 col[i32 + 7] = _mm_setzero_si128();
michael@0 3649 col[i32 + 8] = _mm_setzero_si128();
michael@0 3650 col[i32 + 9] = _mm_setzero_si128();
michael@0 3651 col[i32 + 10] = _mm_setzero_si128();
michael@0 3652 col[i32 + 11] = _mm_setzero_si128();
michael@0 3653 col[i32 + 12] = _mm_setzero_si128();
michael@0 3654 col[i32 + 13] = _mm_setzero_si128();
michael@0 3655 col[i32 + 14] = _mm_setzero_si128();
michael@0 3656 col[i32 + 15] = _mm_setzero_si128();
michael@0 3657 col[i32 + 16] = _mm_setzero_si128();
michael@0 3658 col[i32 + 17] = _mm_setzero_si128();
michael@0 3659 col[i32 + 18] = _mm_setzero_si128();
michael@0 3660 col[i32 + 19] = _mm_setzero_si128();
michael@0 3661 col[i32 + 20] = _mm_setzero_si128();
michael@0 3662 col[i32 + 21] = _mm_setzero_si128();
michael@0 3663 col[i32 + 22] = _mm_setzero_si128();
michael@0 3664 col[i32 + 23] = _mm_setzero_si128();
michael@0 3665 col[i32 + 24] = _mm_setzero_si128();
michael@0 3666 col[i32 + 25] = _mm_setzero_si128();
michael@0 3667 col[i32 + 26] = _mm_setzero_si128();
michael@0 3668 col[i32 + 27] = _mm_setzero_si128();
michael@0 3669 col[i32 + 28] = _mm_setzero_si128();
michael@0 3670 col[i32 + 29] = _mm_setzero_si128();
michael@0 3671 col[i32 + 30] = _mm_setzero_si128();
michael@0 3672 col[i32 + 31] = _mm_setzero_si128();
michael@0 3673 continue;
michael@0 3674 }
michael@0 3675
michael@0 3676 // Transpose 32x8 block to 8x32 block
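      // TRANSPOSE_8X8 transposes an 8x8 tile of 16-bit values entirely in
      // registers, conventionally via _mm_unpacklo/hi_epi16, _epi32 and
      // _epi64 shuffles; applying it to each of the four 8x8 tiles turns
      // the row-major load into the column layout the 1-D transform
      // expects.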
michael@0 3677 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
michael@0 3678 in4, in5, in6, in7);
michael@0 3679 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
michael@0 3680 in10, in11, in12, in13, in14, in15);
michael@0 3681 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
michael@0 3682 in18, in19, in20, in21, in22, in23);
michael@0 3683 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
michael@0 3684 in26, in27, in28, in29, in30, in31);
michael@0 3685 } else {
michael@0 3686 // Second 1-D idct
michael@0 3687 j = i - 4;
michael@0 3688
michael@0 3689 // Transpose 32x8 block to 8x32 block
michael@0 3690 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3691 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3692 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
michael@0 3693 in5, in6, in7);
michael@0 3694 j += 4;
michael@0 3695 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3696 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3697 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
michael@0 3698 in11, in12, in13, in14, in15);
michael@0 3699 j += 4;
michael@0 3700 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3701 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3702 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
michael@0 3703 in19, in20, in21, in22, in23);
michael@0 3704 j += 4;
michael@0 3705 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
michael@0 3706 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
michael@0 3707 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
michael@0 3708 in28, in29, in30, in31);
michael@0 3709 }
michael@0 3710
michael@0 3711 IDCT32_1D
michael@0 3712
michael@0 3713 // Final stage
michael@0 3714 if (i < 4) {
michael@0 3715 // 1-D: Store 32 intermediate results for each 8x32 block.
michael@0 3716 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
michael@0 3717 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
michael@0 3718 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
michael@0 3719 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
michael@0 3720 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
michael@0 3721 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
michael@0 3722 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
michael@0 3723 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
michael@0 3724 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
michael@0 3725 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
michael@0 3726 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
michael@0 3727 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
michael@0 3728 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
michael@0 3729 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
michael@0 3730 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
michael@0 3731 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
michael@0 3732 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
michael@0 3733 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
michael@0 3734 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
michael@0 3735 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
michael@0 3736 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
michael@0 3737 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
michael@0 3738 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
michael@0 3739 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
michael@0 3740 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
michael@0 3741 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
michael@0 3742 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
michael@0 3743 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
michael@0 3744 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
michael@0 3745 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
michael@0 3746 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
michael@0 3747 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
michael@0 3748 } else {
michael@0 3749 const __m128i zero = _mm_setzero_si128();
michael@0 3750
michael@0 3751 // 2-D: Calculate the final results and store them to the destination.
michael@0 3752 in0 = _mm_add_epi16(stp1_0, stp1_31);
michael@0 3753 in1 = _mm_add_epi16(stp1_1, stp1_30);
michael@0 3754 in2 = _mm_add_epi16(stp1_2, stp1_29);
michael@0 3755 in3 = _mm_add_epi16(stp1_3, stp1_28);
michael@0 3756 in4 = _mm_add_epi16(stp1_4, stp1_27);
michael@0 3757 in5 = _mm_add_epi16(stp1_5, stp1_26);
michael@0 3758 in6 = _mm_add_epi16(stp1_6, stp1_25);
michael@0 3759 in7 = _mm_add_epi16(stp1_7, stp1_24);
michael@0 3760 in8 = _mm_add_epi16(stp1_8, stp1_23);
michael@0 3761 in9 = _mm_add_epi16(stp1_9, stp1_22);
michael@0 3762 in10 = _mm_add_epi16(stp1_10, stp1_21);
michael@0 3763 in11 = _mm_add_epi16(stp1_11, stp1_20);
michael@0 3764 in12 = _mm_add_epi16(stp1_12, stp1_19);
michael@0 3765 in13 = _mm_add_epi16(stp1_13, stp1_18);
michael@0 3766 in14 = _mm_add_epi16(stp1_14, stp1_17);
michael@0 3767 in15 = _mm_add_epi16(stp1_15, stp1_16);
michael@0 3768 in16 = _mm_sub_epi16(stp1_15, stp1_16);
michael@0 3769 in17 = _mm_sub_epi16(stp1_14, stp1_17);
michael@0 3770 in18 = _mm_sub_epi16(stp1_13, stp1_18);
michael@0 3771 in19 = _mm_sub_epi16(stp1_12, stp1_19);
michael@0 3772 in20 = _mm_sub_epi16(stp1_11, stp1_20);
michael@0 3773 in21 = _mm_sub_epi16(stp1_10, stp1_21);
michael@0 3774 in22 = _mm_sub_epi16(stp1_9, stp1_22);
michael@0 3775 in23 = _mm_sub_epi16(stp1_8, stp1_23);
michael@0 3776 in24 = _mm_sub_epi16(stp1_7, stp1_24);
michael@0 3777 in25 = _mm_sub_epi16(stp1_6, stp1_25);
michael@0 3778 in26 = _mm_sub_epi16(stp1_5, stp1_26);
michael@0 3779 in27 = _mm_sub_epi16(stp1_4, stp1_27);
michael@0 3780 in28 = _mm_sub_epi16(stp1_3, stp1_28);
michael@0 3781 in29 = _mm_sub_epi16(stp1_2, stp1_29);
michael@0 3782 in30 = _mm_sub_epi16(stp1_1, stp1_30);
michael@0 3783 in31 = _mm_sub_epi16(stp1_0, stp1_31);
michael@0 3784
michael@0 3785 // Final rounding and shift
michael@0 3786 in0 = _mm_adds_epi16(in0, final_rounding);
michael@0 3787 in1 = _mm_adds_epi16(in1, final_rounding);
michael@0 3788 in2 = _mm_adds_epi16(in2, final_rounding);
michael@0 3789 in3 = _mm_adds_epi16(in3, final_rounding);
michael@0 3790 in4 = _mm_adds_epi16(in4, final_rounding);
michael@0 3791 in5 = _mm_adds_epi16(in5, final_rounding);
michael@0 3792 in6 = _mm_adds_epi16(in6, final_rounding);
michael@0 3793 in7 = _mm_adds_epi16(in7, final_rounding);
michael@0 3794 in8 = _mm_adds_epi16(in8, final_rounding);
michael@0 3795 in9 = _mm_adds_epi16(in9, final_rounding);
michael@0 3796 in10 = _mm_adds_epi16(in10, final_rounding);
michael@0 3797 in11 = _mm_adds_epi16(in11, final_rounding);
michael@0 3798 in12 = _mm_adds_epi16(in12, final_rounding);
michael@0 3799 in13 = _mm_adds_epi16(in13, final_rounding);
michael@0 3800 in14 = _mm_adds_epi16(in14, final_rounding);
michael@0 3801 in15 = _mm_adds_epi16(in15, final_rounding);
michael@0 3802 in16 = _mm_adds_epi16(in16, final_rounding);
michael@0 3803 in17 = _mm_adds_epi16(in17, final_rounding);
michael@0 3804 in18 = _mm_adds_epi16(in18, final_rounding);
michael@0 3805 in19 = _mm_adds_epi16(in19, final_rounding);
michael@0 3806 in20 = _mm_adds_epi16(in20, final_rounding);
michael@0 3807 in21 = _mm_adds_epi16(in21, final_rounding);
michael@0 3808 in22 = _mm_adds_epi16(in22, final_rounding);
michael@0 3809 in23 = _mm_adds_epi16(in23, final_rounding);
michael@0 3810 in24 = _mm_adds_epi16(in24, final_rounding);
michael@0 3811 in25 = _mm_adds_epi16(in25, final_rounding);
michael@0 3812 in26 = _mm_adds_epi16(in26, final_rounding);
michael@0 3813 in27 = _mm_adds_epi16(in27, final_rounding);
michael@0 3814 in28 = _mm_adds_epi16(in28, final_rounding);
michael@0 3815 in29 = _mm_adds_epi16(in29, final_rounding);
michael@0 3816 in30 = _mm_adds_epi16(in30, final_rounding);
michael@0 3817 in31 = _mm_adds_epi16(in31, final_rounding);
michael@0 3818
michael@0 3819 in0 = _mm_srai_epi16(in0, 6);
michael@0 3820 in1 = _mm_srai_epi16(in1, 6);
michael@0 3821 in2 = _mm_srai_epi16(in2, 6);
michael@0 3822 in3 = _mm_srai_epi16(in3, 6);
michael@0 3823 in4 = _mm_srai_epi16(in4, 6);
michael@0 3824 in5 = _mm_srai_epi16(in5, 6);
michael@0 3825 in6 = _mm_srai_epi16(in6, 6);
michael@0 3826 in7 = _mm_srai_epi16(in7, 6);
michael@0 3827 in8 = _mm_srai_epi16(in8, 6);
michael@0 3828 in9 = _mm_srai_epi16(in9, 6);
michael@0 3829 in10 = _mm_srai_epi16(in10, 6);
michael@0 3830 in11 = _mm_srai_epi16(in11, 6);
michael@0 3831 in12 = _mm_srai_epi16(in12, 6);
michael@0 3832 in13 = _mm_srai_epi16(in13, 6);
michael@0 3833 in14 = _mm_srai_epi16(in14, 6);
michael@0 3834 in15 = _mm_srai_epi16(in15, 6);
michael@0 3835 in16 = _mm_srai_epi16(in16, 6);
michael@0 3836 in17 = _mm_srai_epi16(in17, 6);
michael@0 3837 in18 = _mm_srai_epi16(in18, 6);
michael@0 3838 in19 = _mm_srai_epi16(in19, 6);
michael@0 3839 in20 = _mm_srai_epi16(in20, 6);
michael@0 3840 in21 = _mm_srai_epi16(in21, 6);
michael@0 3841 in22 = _mm_srai_epi16(in22, 6);
michael@0 3842 in23 = _mm_srai_epi16(in23, 6);
michael@0 3843 in24 = _mm_srai_epi16(in24, 6);
michael@0 3844 in25 = _mm_srai_epi16(in25, 6);
michael@0 3845 in26 = _mm_srai_epi16(in26, 6);
michael@0 3846 in27 = _mm_srai_epi16(in27, 6);
michael@0 3847 in28 = _mm_srai_epi16(in28, 6);
michael@0 3848 in29 = _mm_srai_epi16(in29, 6);
michael@0 3849 in30 = _mm_srai_epi16(in30, 6);
michael@0 3850 in31 = _mm_srai_epi16(in31, 6);
michael@0 3851
michael@0 3852 RECON_AND_STORE(dest, in0);
michael@0 3853 RECON_AND_STORE(dest, in1);
michael@0 3854 RECON_AND_STORE(dest, in2);
michael@0 3855 RECON_AND_STORE(dest, in3);
michael@0 3856 RECON_AND_STORE(dest, in4);
michael@0 3857 RECON_AND_STORE(dest, in5);
michael@0 3858 RECON_AND_STORE(dest, in6);
michael@0 3859 RECON_AND_STORE(dest, in7);
michael@0 3860 RECON_AND_STORE(dest, in8);
michael@0 3861 RECON_AND_STORE(dest, in9);
michael@0 3862 RECON_AND_STORE(dest, in10);
michael@0 3863 RECON_AND_STORE(dest, in11);
michael@0 3864 RECON_AND_STORE(dest, in12);
michael@0 3865 RECON_AND_STORE(dest, in13);
michael@0 3866 RECON_AND_STORE(dest, in14);
michael@0 3867 RECON_AND_STORE(dest, in15);
michael@0 3868 RECON_AND_STORE(dest, in16);
michael@0 3869 RECON_AND_STORE(dest, in17);
michael@0 3870 RECON_AND_STORE(dest, in18);
michael@0 3871 RECON_AND_STORE(dest, in19);
michael@0 3872 RECON_AND_STORE(dest, in20);
michael@0 3873 RECON_AND_STORE(dest, in21);
michael@0 3874 RECON_AND_STORE(dest, in22);
michael@0 3875 RECON_AND_STORE(dest, in23);
michael@0 3876 RECON_AND_STORE(dest, in24);
michael@0 3877 RECON_AND_STORE(dest, in25);
michael@0 3878 RECON_AND_STORE(dest, in26);
michael@0 3879 RECON_AND_STORE(dest, in27);
michael@0 3880 RECON_AND_STORE(dest, in28);
michael@0 3881 RECON_AND_STORE(dest, in29);
michael@0 3882 RECON_AND_STORE(dest, in30);
michael@0 3883 RECON_AND_STORE(dest, in31);
michael@0 3884
michael@0 3885 dest += 8 - (stride * 32);
michael@0 3886 }
michael@0 3887 }
michael@0 3888 }  // NOLINT
michael@0 3889
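/*
 * DC-only shortcut: when just the DC coefficient is nonzero, the 2-D
 * IDCT reduces to adding a single constant to every pixel. The two
 * dct_const_round_shift calls apply the row and column DC gains
 * (cospi_16_64 each) and ROUND_POWER_OF_TWO(a, 6) removes the final
 * transform scaling. A scalar sketch of the reconstruction below, with
 * clip8 denoting a clamp to [0, 255]:
 *
 *   for (r = 0; r < 32; ++r)
 *     for (c = 0; c < 32; ++c)
 *       dest[r * stride + c] = clip8(dest[r * stride + c] + a);
 */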
michael@0 3890 void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
michael@0 3891 __m128i dc_value;
michael@0 3892 const __m128i zero = _mm_setzero_si128();
michael@0 3893 int a, i;
michael@0 3894
michael@0 3895 a = dct_const_round_shift(input[0] * cospi_16_64);
michael@0 3896 a = dct_const_round_shift(a * cospi_16_64);
michael@0 3897 a = ROUND_POWER_OF_TWO(a, 6);
michael@0 3898
michael@0 3899 dc_value = _mm_set1_epi16(a);
michael@0 3900
michael@0 3901 for (i = 0; i < 4; ++i) {
michael@0 3902 RECON_AND_STORE(dest, dc_value);
michael@0 3903 RECON_AND_STORE(dest, dc_value);
michael@0 3904 RECON_AND_STORE(dest, dc_value);
michael@0 3905 RECON_AND_STORE(dest, dc_value);
michael@0 3906 RECON_AND_STORE(dest, dc_value);
michael@0 3907 RECON_AND_STORE(dest, dc_value);
michael@0 3908 RECON_AND_STORE(dest, dc_value);
michael@0 3909 RECON_AND_STORE(dest, dc_value);
michael@0 3910 RECON_AND_STORE(dest, dc_value);
michael@0 3911 RECON_AND_STORE(dest, dc_value);
michael@0 3912 RECON_AND_STORE(dest, dc_value);
michael@0 3913 RECON_AND_STORE(dest, dc_value);
michael@0 3914 RECON_AND_STORE(dest, dc_value);
michael@0 3915 RECON_AND_STORE(dest, dc_value);
michael@0 3916 RECON_AND_STORE(dest, dc_value);
michael@0 3917 RECON_AND_STORE(dest, dc_value);
michael@0 3918 RECON_AND_STORE(dest, dc_value);
michael@0 3919 RECON_AND_STORE(dest, dc_value);
michael@0 3920 RECON_AND_STORE(dest, dc_value);
michael@0 3921 RECON_AND_STORE(dest, dc_value);
michael@0 3922 RECON_AND_STORE(dest, dc_value);
michael@0 3923 RECON_AND_STORE(dest, dc_value);
michael@0 3924 RECON_AND_STORE(dest, dc_value);
michael@0 3925 RECON_AND_STORE(dest, dc_value);
michael@0 3926 RECON_AND_STORE(dest, dc_value);
michael@0 3927 RECON_AND_STORE(dest, dc_value);
michael@0 3928 RECON_AND_STORE(dest, dc_value);
michael@0 3929 RECON_AND_STORE(dest, dc_value);
michael@0 3930 RECON_AND_STORE(dest, dc_value);
michael@0 3931 RECON_AND_STORE(dest, dc_value);
michael@0 3932 RECON_AND_STORE(dest, dc_value);
michael@0 3933 RECON_AND_STORE(dest, dc_value);
michael@0 3934 dest += 8 - (stride * 32);
michael@0 3935 }
michael@0 3936 }