media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 15 Jan 2015 15:59:08 +0100
branch       TOR_BUG_9701
changeset    10:ac0c01689b40
permissions  -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
this solves Tor bug #9701, complying with the disk avoidance requirements
documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>     // for assert() in the tx_type switches below
#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is, the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // Constants
  //    In one case the constant is the same in every 16-bit lane. In all
  //    the others it is a pair of constants that we need to repeat four
  //    times, which is done by constructing the 32-bit constant
  //    corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only contain whether the first value is zero; all
      // other comparisons will fail, as something shifted by 4 (above << 4)
      // can never be equal to one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
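      // For example, input[0] = 3 becomes 48 after the shift: mask = 0
      // and 48 + 0 + 1 = 49, while input[0] = 0 stays 0 + (-1) + 1 = 0.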
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    // into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31  in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33  in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition the output, (v + 1) >> 2, and store it, taking
      // advantage of the fact that rows 1/3 are stored just after rows 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

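  // Same non-zero bias trick as in vp9_fdct4x4_sse2 above: the compare
  // against {0, 1, ..., 1} can only ever match a zero first element, so
  // adding the mask and then 1 increments the first lane of in[0]
  // exactly when the original input[0] was non-zero.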
  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

void fdct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

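// 4-point forward ADST. The sinpi_N_9 constants are the fixed-point
// sin(N * pi / 9) factors from vp9/common/vp9_idct.h. The in7 sum below
// feeds a combined sinpi_3_9 * (in[0] + in[1] - in[3]) term (together
// with the v[6] subtraction), cf. the scalar fadst4() reference.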
void fadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[4];
  load_buffer_4x4(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_4x4(output, in);
}

void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
  int pass;
  // Constants
  //    In one case the constant is the same in every 16-bit lane. In all
  //    the others it is a pair of constants that we need to repeat four
  //    times, which is done by constructing the 32-bit constant
  //    corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32 bits.
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
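      // Scalar equivalent of the two steps below (DCT_CONST_BITS is 14
      // and DCT_CONST_ROUNDING is 1 << (DCT_CONST_BITS - 1), see
      // vp9/common/vp9_idct.h):
      //   w = (u + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;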
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32 bits.
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32 bits.
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    // division by two of 16-bit signed numbers using shifts
    // n / 2 = (n - (n >> 15)) >> 1
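    // Worked example, n = -5: the sign mask is -5 >> 15 = -1, so
    // (-5 - (-1)) >> 1 = -4 >> 1 = -2, matching C's truncating -5 / 2;
    // a plain -5 >> 1 would floor to -3.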
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
  }
}

// load 8x8 array and pre-condition it: the input is scaled up by four,
// matching the << 2 pre-conditioning in vp9_fdct8x8_sse2 above
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding
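// Scalar sketch of what follows, for each 16-bit lane x:
//   x = (x + rounding + (x < 0 ? 1 : 0)) >> bit
// where rounding = 1 << (bit - 2) when bit >= 2 and 0 otherwise; the
// (x < 0) term comes from subtracting the arithmetic sign mask computed
// below, so negative values round the same way as in the scalar code
// (e.g. truncation toward zero when bit == 1).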
static INLINE void right_shift_8x8(__m128i *res, int const bit) {
  const __m128i kOne = _mm_set1_epi16(1);
  const int bit_m02 = bit - 2;
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit_m02 >= 0) {
    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
    res[0] = _mm_add_epi16(res[0], k_const_rounding);
    res[1] = _mm_add_epi16(res[1], k_const_rounding);
    res[2] = _mm_add_epi16(res[2], k_const_rounding);
    res[3] = _mm_add_epi16(res[3], k_const_rounding);
    res[4] = _mm_add_epi16(res[4], k_const_rounding);
    res[5] = _mm_add_epi16(res[5], k_const_rounding);
    res[6] = _mm_add_epi16(res[6], k_const_rounding);
    res[7] = _mm_add_epi16(res[7], k_const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  res[0] = _mm_srai_epi16(res[0], bit);
  res[1] = _mm_srai_epi16(res[1], bit);
  res[2] = _mm_srai_epi16(res[2], bit);
  res[3] = _mm_srai_epi16(res[3], bit);
  res[4] = _mm_srai_epi16(res[4], bit);
  res[5] = _mm_srai_epi16(res[5], bit);
  res[6] = _mm_srai_epi16(res[6], bit);
  res[7] = _mm_srai_epi16(res[7], bit);
}

// write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {
  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}

// Transpose an 8x8 array of 16-bit values. In this file it is always
// called with res == in, i.e. as an in-place transpose.
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
}

void fdct8_1d_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in, in);
}

void fadst8_1d_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // Reorder the inputs into the order the first butterfly stage expects.
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  array_transpose_8x8(in, in);
}

void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[8];
  load_buffer_8x8(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  right_shift_8x8(in, 1);
  write_buffer_8x8(output, in, 8);
}

void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is, the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes: 16 * 16 coefficients.
  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
  const int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    In one case the constant is the same in every 16-bit lane. In all
  //    the others it is a pair of constants that we need to repeat four
  //    times, which is done by constructing the 32-bit constant
  //    corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
michael@0 1127 in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
michael@0 1128 // x = x << 2
michael@0 1129 in00 = _mm_slli_epi16(in00, 2);
michael@0 1130 in01 = _mm_slli_epi16(in01, 2);
michael@0 1131 in02 = _mm_slli_epi16(in02, 2);
michael@0 1132 in03 = _mm_slli_epi16(in03, 2);
michael@0 1133 in04 = _mm_slli_epi16(in04, 2);
michael@0 1134 in05 = _mm_slli_epi16(in05, 2);
michael@0 1135 in06 = _mm_slli_epi16(in06, 2);
michael@0 1136 in07 = _mm_slli_epi16(in07, 2);
michael@0 1137 in08 = _mm_slli_epi16(in08, 2);
michael@0 1138 in09 = _mm_slli_epi16(in09, 2);
michael@0 1139 in10 = _mm_slli_epi16(in10, 2);
michael@0 1140 in11 = _mm_slli_epi16(in11, 2);
michael@0 1141 in12 = _mm_slli_epi16(in12, 2);
michael@0 1142 in13 = _mm_slli_epi16(in13, 2);
michael@0 1143 in14 = _mm_slli_epi16(in14, 2);
michael@0 1144 in15 = _mm_slli_epi16(in15, 2);
michael@0 1145 } else {
michael@0 1146 in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
michael@0 1147 in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
michael@0 1148 in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
michael@0 1149 in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
michael@0 1150 in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
michael@0 1151 in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
michael@0 1152 in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
michael@0 1153 in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
michael@0 1154 in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
michael@0 1155 in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
michael@0 1156 in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
michael@0 1157 in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
michael@0 1158 in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
michael@0 1159 in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
michael@0 1160 in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
michael@0 1161 in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
michael@0 1162 // x = (x + 1) >> 2
michael@0 1163 in00 = _mm_add_epi16(in00, kOne);
michael@0 1164 in01 = _mm_add_epi16(in01, kOne);
michael@0 1165 in02 = _mm_add_epi16(in02, kOne);
michael@0 1166 in03 = _mm_add_epi16(in03, kOne);
michael@0 1167 in04 = _mm_add_epi16(in04, kOne);
michael@0 1168 in05 = _mm_add_epi16(in05, kOne);
michael@0 1169 in06 = _mm_add_epi16(in06, kOne);
michael@0 1170 in07 = _mm_add_epi16(in07, kOne);
michael@0 1171 in08 = _mm_add_epi16(in08, kOne);
michael@0 1172 in09 = _mm_add_epi16(in09, kOne);
michael@0 1173 in10 = _mm_add_epi16(in10, kOne);
michael@0 1174 in11 = _mm_add_epi16(in11, kOne);
michael@0 1175 in12 = _mm_add_epi16(in12, kOne);
michael@0 1176 in13 = _mm_add_epi16(in13, kOne);
michael@0 1177 in14 = _mm_add_epi16(in14, kOne);
michael@0 1178 in15 = _mm_add_epi16(in15, kOne);
michael@0 1179 in00 = _mm_srai_epi16(in00, 2);
michael@0 1180 in01 = _mm_srai_epi16(in01, 2);
michael@0 1181 in02 = _mm_srai_epi16(in02, 2);
michael@0 1182 in03 = _mm_srai_epi16(in03, 2);
michael@0 1183 in04 = _mm_srai_epi16(in04, 2);
michael@0 1184 in05 = _mm_srai_epi16(in05, 2);
michael@0 1185 in06 = _mm_srai_epi16(in06, 2);
michael@0 1186 in07 = _mm_srai_epi16(in07, 2);
michael@0 1187 in08 = _mm_srai_epi16(in08, 2);
michael@0 1188 in09 = _mm_srai_epi16(in09, 2);
michael@0 1189 in10 = _mm_srai_epi16(in10, 2);
michael@0 1190 in11 = _mm_srai_epi16(in11, 2);
michael@0 1191 in12 = _mm_srai_epi16(in12, 2);
michael@0 1192 in13 = _mm_srai_epi16(in13, 2);
michael@0 1193 in14 = _mm_srai_epi16(in14, 2);
michael@0 1194 in15 = _mm_srai_epi16(in15, 2);
michael@0 1195 }
michael@0 1196 in += 8;
michael@0 1197 // Calculate input for the first 8 results.
michael@0 1198 {
michael@0 1199 input0 = _mm_add_epi16(in00, in15);
michael@0 1200 input1 = _mm_add_epi16(in01, in14);
michael@0 1201 input2 = _mm_add_epi16(in02, in13);
michael@0 1202 input3 = _mm_add_epi16(in03, in12);
michael@0 1203 input4 = _mm_add_epi16(in04, in11);
michael@0 1204 input5 = _mm_add_epi16(in05, in10);
michael@0 1205 input6 = _mm_add_epi16(in06, in09);
michael@0 1206 input7 = _mm_add_epi16(in07, in08);
michael@0 1207 }
michael@0 1208 // Calculate input for the next 8 results.
michael@0 1209 {
michael@0 1210 step1_0 = _mm_sub_epi16(in07, in08);
michael@0 1211 step1_1 = _mm_sub_epi16(in06, in09);
michael@0 1212 step1_2 = _mm_sub_epi16(in05, in10);
michael@0 1213 step1_3 = _mm_sub_epi16(in04, in11);
michael@0 1214 step1_4 = _mm_sub_epi16(in03, in12);
michael@0 1215 step1_5 = _mm_sub_epi16(in02, in13);
michael@0 1216 step1_6 = _mm_sub_epi16(in01, in14);
michael@0 1217 step1_7 = _mm_sub_epi16(in00, in15);
michael@0 1218 }
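// Added commentary (illustrative): input0..input7 (the mirrored sums) feed
// the 8-point DCT below, which produces the even-indexed outputs res00,
// res02, ..., res14; step1_0..step1_7 (the mirrored differences) feed the
// rotation network that produces the odd-indexed outputs.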
michael@0 1219 // Work on the first eight values; fdct8_1d(input, even_results);
michael@0 1220 {
michael@0 1221 // Add/subtract
michael@0 1222 const __m128i q0 = _mm_add_epi16(input0, input7);
michael@0 1223 const __m128i q1 = _mm_add_epi16(input1, input6);
michael@0 1224 const __m128i q2 = _mm_add_epi16(input2, input5);
michael@0 1225 const __m128i q3 = _mm_add_epi16(input3, input4);
michael@0 1226 const __m128i q4 = _mm_sub_epi16(input3, input4);
michael@0 1227 const __m128i q5 = _mm_sub_epi16(input2, input5);
michael@0 1228 const __m128i q6 = _mm_sub_epi16(input1, input6);
michael@0 1229 const __m128i q7 = _mm_sub_epi16(input0, input7);
michael@0 1230 // Work on first four results
michael@0 1231 {
michael@0 1232 // Add/subtract
michael@0 1233 const __m128i r0 = _mm_add_epi16(q0, q3);
michael@0 1234 const __m128i r1 = _mm_add_epi16(q1, q2);
michael@0 1235 const __m128i r2 = _mm_sub_epi16(q1, q2);
michael@0 1236 const __m128i r3 = _mm_sub_epi16(q0, q3);
michael@0 1237 // Interleave to do the multiply by constants which gets us
michael@0 1238 // into 32 bits.
michael@0 1239 const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
michael@0 1240 const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
michael@0 1241 const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
michael@0 1242 const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
michael@0 1243 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
michael@0 1244 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
michael@0 1245 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
michael@0 1246 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
michael@0 1247 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
michael@0 1248 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
michael@0 1249 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
michael@0 1250 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
michael@0 1251 // dct_const_round_shift
michael@0 1252 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1253 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1254 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1255 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1256 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
michael@0 1257 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
michael@0 1258 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
michael@0 1259 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
michael@0 1260 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1261 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1262 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1263 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1264 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
michael@0 1265 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
michael@0 1266 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
michael@0 1267 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
michael@0 1268 // Combine
michael@0 1269 res00 = _mm_packs_epi32(w0, w1);
michael@0 1270 res08 = _mm_packs_epi32(w2, w3);
michael@0 1271 res04 = _mm_packs_epi32(w4, w5);
michael@0 1272 res12 = _mm_packs_epi32(w6, w7);
michael@0 1273 }
michael@0 1274 // Work on next four results
michael@0 1275 {
michael@0 1276 // Interleave to do the multiply by constants which gets us
michael@0 1277 // into 32 bits.
michael@0 1278 const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
michael@0 1279 const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
michael@0 1280 const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
michael@0 1281 const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
michael@0 1282 const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
michael@0 1283 const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
michael@0 1284 // dct_const_round_shift
michael@0 1285 const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
michael@0 1286 const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
michael@0 1287 const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
michael@0 1288 const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
michael@0 1289 const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
michael@0 1290 const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
michael@0 1291 const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
michael@0 1292 const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
michael@0 1293 // Combine
michael@0 1294 const __m128i r0 = _mm_packs_epi32(s0, s1);
michael@0 1295 const __m128i r1 = _mm_packs_epi32(s2, s3);
michael@0 1296 // Add/subtract
michael@0 1297 const __m128i x0 = _mm_add_epi16(q4, r0);
michael@0 1298 const __m128i x1 = _mm_sub_epi16(q4, r0);
michael@0 1299 const __m128i x2 = _mm_sub_epi16(q7, r1);
michael@0 1300 const __m128i x3 = _mm_add_epi16(q7, r1);
michael@0 1301 // Interleave to do the multiply by constants which gets us
michael@0 1302 // into 32 bits.
michael@0 1303 const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
michael@0 1304 const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
michael@0 1305 const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
michael@0 1306 const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
michael@0 1307 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
michael@0 1308 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
michael@0 1309 const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
michael@0 1310 const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
michael@0 1311 const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
michael@0 1312 const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
michael@0 1313 const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
michael@0 1314 const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
michael@0 1315 // dct_const_round_shift
michael@0 1316 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1317 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1318 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1319 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1320 const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
michael@0 1321 const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
michael@0 1322 const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
michael@0 1323 const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
michael@0 1324 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1325 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1326 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1327 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1328 const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
michael@0 1329 const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
michael@0 1330 const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
michael@0 1331 const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
michael@0 1332 // Combine
michael@0 1333 res02 = _mm_packs_epi32(w0, w1);
michael@0 1334 res14 = _mm_packs_epi32(w2, w3);
michael@0 1335 res10 = _mm_packs_epi32(w4, w5);
michael@0 1336 res06 = _mm_packs_epi32(w6, w7);
michael@0 1337 }
michael@0 1338 }
michael@0 1339 // Work on the next eight values; step1 -> odd_results
michael@0 1340 {
michael@0 1341 // step 2
michael@0 1342 {
michael@0 1343 const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
michael@0 1344 const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
michael@0 1345 const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
michael@0 1346 const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
michael@0 1347 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
michael@0 1348 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
michael@0 1349 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
michael@0 1350 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
michael@0 1351 // dct_const_round_shift
michael@0 1352 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1353 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1354 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1355 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1356 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1357 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1358 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1359 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1360 // Combine
michael@0 1361 step2_2 = _mm_packs_epi32(w0, w1);
michael@0 1362 step2_3 = _mm_packs_epi32(w2, w3);
michael@0 1363 }
michael@0 1364 {
michael@0 1365 const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
michael@0 1366 const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
michael@0 1367 const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
michael@0 1368 const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
michael@0 1369 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
michael@0 1370 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
michael@0 1371 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
michael@0 1372 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
michael@0 1373 // dct_const_round_shift
michael@0 1374 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1375 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1376 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1377 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1378 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1379 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1380 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1381 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1382 // Combine
michael@0 1383 step2_5 = _mm_packs_epi32(w0, w1);
michael@0 1384 step2_4 = _mm_packs_epi32(w2, w3);
michael@0 1385 }
michael@0 1386 // step 3
michael@0 1387 {
michael@0 1388 step3_0 = _mm_add_epi16(step1_0, step2_3);
michael@0 1389 step3_1 = _mm_add_epi16(step1_1, step2_2);
michael@0 1390 step3_2 = _mm_sub_epi16(step1_1, step2_2);
michael@0 1391 step3_3 = _mm_sub_epi16(step1_0, step2_3);
michael@0 1392 step3_4 = _mm_sub_epi16(step1_7, step2_4);
michael@0 1393 step3_5 = _mm_sub_epi16(step1_6, step2_5);
michael@0 1394 step3_6 = _mm_add_epi16(step1_6, step2_5);
michael@0 1395 step3_7 = _mm_add_epi16(step1_7, step2_4);
michael@0 1396 }
michael@0 1397 // step 4
michael@0 1398 {
michael@0 1399 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
michael@0 1400 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
michael@0 1401 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
michael@0 1402 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
michael@0 1403 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
michael@0 1404 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
michael@0 1405 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
michael@0 1406 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
michael@0 1407 // dct_const_round_shift
michael@0 1408 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1409 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1410 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1411 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1412 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1413 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1414 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1415 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1416 // Combine
michael@0 1417 step2_1 = _mm_packs_epi32(w0, w1);
michael@0 1418 step2_2 = _mm_packs_epi32(w2, w3);
michael@0 1419 }
michael@0 1420 {
michael@0 1421 const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
michael@0 1422 const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
michael@0 1423 const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
michael@0 1424 const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
michael@0 1425 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
michael@0 1426 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
michael@0 1427 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
michael@0 1428 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
michael@0 1429 // dct_const_round_shift
michael@0 1430 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1431 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1432 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1433 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1434 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1435 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1436 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1437 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1438 // Combine
michael@0 1439 step2_6 = _mm_packs_epi32(w0, w1);
michael@0 1440 step2_5 = _mm_packs_epi32(w2, w3);
michael@0 1441 }
michael@0 1442 // step 5
michael@0 1443 {
michael@0 1444 step1_0 = _mm_add_epi16(step3_0, step2_1);
michael@0 1445 step1_1 = _mm_sub_epi16(step3_0, step2_1);
michael@0 1446 step1_2 = _mm_sub_epi16(step3_3, step2_2);
michael@0 1447 step1_3 = _mm_add_epi16(step3_3, step2_2);
michael@0 1448 step1_4 = _mm_add_epi16(step3_4, step2_5);
michael@0 1449 step1_5 = _mm_sub_epi16(step3_4, step2_5);
michael@0 1450 step1_6 = _mm_sub_epi16(step3_7, step2_6);
michael@0 1451 step1_7 = _mm_add_epi16(step3_7, step2_6);
michael@0 1452 }
michael@0 1453 // step 6
michael@0 1454 {
michael@0 1455 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
michael@0 1456 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
michael@0 1457 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
michael@0 1458 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
michael@0 1459 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
michael@0 1460 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
michael@0 1461 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
michael@0 1462 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
michael@0 1463 // dct_const_round_shift
michael@0 1464 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1465 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1466 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1467 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1468 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1469 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1470 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1471 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1472 // Combine
michael@0 1473 res01 = _mm_packs_epi32(w0, w1);
michael@0 1474 res09 = _mm_packs_epi32(w2, w3);
michael@0 1475 }
michael@0 1476 {
michael@0 1477 const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
michael@0 1478 const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
michael@0 1479 const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
michael@0 1480 const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
michael@0 1481 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
michael@0 1482 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
michael@0 1483 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
michael@0 1484 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
michael@0 1485 // dct_const_round_shift
michael@0 1486 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1487 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1488 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1489 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1490 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1491 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1492 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1493 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1494 // Combine
michael@0 1495 res05 = _mm_packs_epi32(w0, w1);
michael@0 1496 res13 = _mm_packs_epi32(w2, w3);
michael@0 1497 }
michael@0 1498 {
michael@0 1499 const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
michael@0 1500 const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
michael@0 1501 const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
michael@0 1502 const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
michael@0 1503 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
michael@0 1504 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
michael@0 1505 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
michael@0 1506 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
michael@0 1507 // dct_const_round_shift
michael@0 1508 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1509 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1510 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1511 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1512 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1513 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1514 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1515 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1516 // Combine
michael@0 1517 res11 = _mm_packs_epi32(w0, w1);
michael@0 1518 res03 = _mm_packs_epi32(w2, w3);
michael@0 1519 }
michael@0 1520 {
michael@0 1521 const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
michael@0 1522 const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
michael@0 1523 const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
michael@0 1524 const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
michael@0 1525 const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
michael@0 1526 const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
michael@0 1527 const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
michael@0 1528 const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
michael@0 1529 // dct_const_round_shift
michael@0 1530 const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
michael@0 1531 const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
michael@0 1532 const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
michael@0 1533 const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
michael@0 1534 const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
michael@0 1535 const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
michael@0 1536 const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
michael@0 1537 const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
michael@0 1538 // Combine
michael@0 1539 res15 = _mm_packs_epi32(w0, w1);
michael@0 1540 res07 = _mm_packs_epi32(w2, w3);
michael@0 1541 }
michael@0 1542 }
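// Added commentary (illustrative): res00..res15 now hold one 16-point 1-D
// DCT per column (per transposed row in the second pass), indexed in
// frequency order with res00 as DC.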
michael@0 1543 // Transpose the results; do this as two 8x8 transposes.
michael@0 1544 {
michael@0 1545 // 00 01 02 03 04 05 06 07
michael@0 1546 // 10 11 12 13 14 15 16 17
michael@0 1547 // 20 21 22 23 24 25 26 27
michael@0 1548 // 30 31 32 33 34 35 36 37
michael@0 1549 // 40 41 42 43 44 45 46 47
michael@0 1550 // 50 51 52 53 54 55 56 57
michael@0 1551 // 60 61 62 63 64 65 66 67
michael@0 1552 // 70 71 72 73 74 75 76 77
michael@0 1553 const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
michael@0 1554 const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
michael@0 1555 const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
michael@0 1556 const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
michael@0 1557 const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
michael@0 1558 const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
michael@0 1559 const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
michael@0 1560 const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
michael@0 1561 // 00 10 01 11 02 12 03 13
michael@0 1562 // 20 30 21 31 22 32 23 33
michael@0 1563 // 04 14 05 15 06 16 07 17
michael@0 1564 // 24 34 25 35 26 36 27 37
michael@0 1565 // 40 50 41 51 42 52 43 53
michael@0 1566 // 60 70 61 71 62 72 63 73
michael@0 1567 // 44 54 45 55 46 56 47 57
michael@0 1568 // 64 74 65 75 66 76 67 77
michael@0 1569 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
michael@0 1570 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
michael@0 1571 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
michael@0 1572 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
michael@0 1573 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
michael@0 1574 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
michael@0 1575 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
michael@0 1576 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
michael@0 1577 // 00 10 20 30 01 11 21 31
michael@0 1578 // 40 50 60 70 41 51 61 71
michael@0 1579 // 02 12 22 32 03 13 23 33
michael@0 1580 // 42 52 62 72 43 53 63 73
michael@0 1581 // 04 14 24 34 05 15 25 35
michael@0 1582 // 44 54 64 74 45 55 65 75
michael@0 1583 // 06 16 26 36 07 17 27 37
michael@0 1584 // 46 56 66 76 47 57 67 77
michael@0 1585 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
michael@0 1586 const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
michael@0 1587 const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
michael@0 1588 const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
michael@0 1589 const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
michael@0 1590 const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
michael@0 1591 const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
michael@0 1592 const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
michael@0 1593 // 00 10 20 30 40 50 60 70
michael@0 1594 // 01 11 21 31 41 51 61 71
michael@0 1595 // 02 12 22 32 42 52 62 72
michael@0 1596 // 03 13 23 33 43 53 63 73
michael@0 1597 // 04 14 24 34 44 54 64 74
michael@0 1598 // 05 15 25 35 45 55 65 75
michael@0 1599 // 06 16 26 36 46 56 66 76
michael@0 1600 // 07 17 27 37 47 57 67 77
michael@0 1601 _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
michael@0 1602 _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
michael@0 1603 _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
michael@0 1604 _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
michael@0 1605 _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
michael@0 1606 _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
michael@0 1607 _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
michael@0 1608 _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
michael@0 1609 }
michael@0 1610 {
michael@0 1611 // 00 01 02 03 04 05 06 07
michael@0 1612 // 10 11 12 13 14 15 16 17
michael@0 1613 // 20 21 22 23 24 25 26 27
michael@0 1614 // 30 31 32 33 34 35 36 37
michael@0 1615 // 40 41 42 43 44 45 46 47
michael@0 1616 // 50 51 52 53 54 55 56 57
michael@0 1617 // 60 61 62 63 64 65 66 67
michael@0 1618 // 70 71 72 73 74 75 76 77
michael@0 1619 const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
michael@0 1620 const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
michael@0 1621 const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
michael@0 1622 const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
michael@0 1623 const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
michael@0 1624 const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
michael@0 1625 const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
michael@0 1626 const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
michael@0 1627 // 00 10 01 11 02 12 03 13
michael@0 1628 // 20 30 21 31 22 32 23 33
michael@0 1629 // 04 14 05 15 06 16 07 17
michael@0 1630 // 24 34 25 35 26 36 27 37
michael@0 1631 // 40 50 41 51 42 52 43 53
michael@0 1632 // 60 70 61 71 62 72 63 73
michael@0 1633 // 44 54 45 55 46 56 47 57
michael@0 1634 // 64 74 65 75 66 76 67 77
michael@0 1635 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
michael@0 1636 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
michael@0 1637 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
michael@0 1638 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
michael@0 1639 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
michael@0 1640 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
michael@0 1641 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
michael@0 1642 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
michael@0 1643 // 00 10 20 30 01 11 21 31
michael@0 1644 // 40 50 60 70 41 51 61 71
michael@0 1645 // 02 12 22 32 03 13 23 33
michael@0 1646 // 42 52 62 72 43 53 63 73
michael@0 1647 // 04 14 24 34 05 15 25 35
michael@0 1648 // 44 54 64 74 45 55 65 75
michael@0 1649 // 06 16 26 36 07 17 27 37
michael@0 1650 // 46 56 66 76 47 57 67 77
michael@0 1651 const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
michael@0 1652 const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
michael@0 1653 const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
michael@0 1654 const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
michael@0 1655 const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
michael@0 1656 const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
michael@0 1657 const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
michael@0 1658 const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
michael@0 1659 // 00 10 20 30 40 50 60 70
michael@0 1660 // 01 11 21 31 41 51 61 71
michael@0 1661 // 02 12 22 32 42 52 62 72
michael@0 1662 // 03 13 23 33 43 53 63 73
michael@0 1663 // 04 14 24 34 44 54 64 74
michael@0 1664 // 05 15 25 35 45 55 65 75
michael@0 1665 // 06 16 26 36 46 56 66 76
michael@0 1666 // 07 17 27 37 47 57 67 77
michael@0 1667 // Store results
michael@0 1668 _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
michael@0 1669 _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
michael@0 1670 _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
michael@0 1671 _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
michael@0 1672 _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
michael@0 1673 _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
michael@0 1674 _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
michael@0 1675 _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
michael@0 1676 }
michael@0 1677 out += 8*16;
michael@0 1678 }
michael@0 1679 // Set up in/out for the next pass.
michael@0 1680 in = intermediate;
michael@0 1681 out = output;
michael@0 1682 }
michael@0 1683 }
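// Illustrative sketch (hypothetical helpers, not in libvpx): scalar
// equivalents of the per-pass conditioning above. Pass 0 scales residuals
// up by a factor of 4 for two extra bits of precision; pass 1 rounds those
// bits back out before the row transform.
static INLINE int16_t illustrative_pass0_scale(int16_t x) {
  return (int16_t)(x << 2);        // matches _mm_slli_epi16(in, 2)
}
static INLINE int16_t illustrative_pass1_round(int16_t x) {
  return (int16_t)((x + 1) >> 2);  // matches the _mm_add_epi16/_mm_srai_epi16 pair
}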
michael@0 1684
michael@0 1685 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
michael@0 1686 __m128i *in1, int stride) {
michael@0 1687 // load first 8 columns
michael@0 1688 load_buffer_8x8(input, in0, stride);
michael@0 1689 load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
michael@0 1690
michael@0 1691 input += 8;
michael@0 1692 // load second 8 columns
michael@0 1693 load_buffer_8x8(input, in1, stride);
michael@0 1694 load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
michael@0 1695 }
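// Added commentary (illustrative): in0 receives the left 8 columns and in1
// the right 8 columns, each as 16 rows, so both halves can run through the
// 8-lane 1-D routines before array_transpose_16x16() stitches them together.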
michael@0 1696
michael@0 1697 static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0,
michael@0 1698 __m128i *in1, int stride) {
michael@0 1699 // write first 8 columns
michael@0 1700 write_buffer_8x8(output, in0, stride);
michael@0 1701 write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
michael@0 1702 // write second 8 columns
michael@0 1703 output += 8;
michael@0 1704 write_buffer_8x8(output, in1, stride);
michael@0 1705 write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
michael@0 1706 }
michael@0 1707
michael@0 1708 static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
michael@0 1709 __m128i tbuf[8];
michael@0 1710 array_transpose_8x8(res0, res0);
michael@0 1711 array_transpose_8x8(res1, tbuf);
michael@0 1712 array_transpose_8x8(res0 + 8, res1);
michael@0 1713 array_transpose_8x8(res1 + 8, res1 + 8);
michael@0 1714
michael@0 1715 res0[8] = tbuf[0];
michael@0 1716 res0[9] = tbuf[1];
michael@0 1717 res0[10] = tbuf[2];
michael@0 1718 res0[11] = tbuf[3];
michael@0 1719 res0[12] = tbuf[4];
michael@0 1720 res0[13] = tbuf[5];
michael@0 1721 res0[14] = tbuf[6];
michael@0 1722 res0[15] = tbuf[7];
michael@0 1723 }
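// Added commentary (illustrative): a 16x16 transpose decomposes into four
// 8x8 block transposes with the off-diagonal blocks exchanged:
//   [ A B ]^T  =  [ A^T C^T ]
//   [ C D ]       [ B^T D^T ]
// res0/res1 hold the left/right 8 columns; tbuf parks B^T while C^T
// overwrites res1[0..7], then B^T lands in res0[8..15].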
michael@0 1724
michael@0 1725 static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
michael@0 1726 // perform rounding operations
michael@0 1727 right_shift_8x8(res0, 2);
michael@0 1728 right_shift_8x8(res0 + 8, 2);
michael@0 1729 right_shift_8x8(res1, 2);
michael@0 1730 right_shift_8x8(res1 + 8, 2);
michael@0 1731 }
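// Added commentary (illustrative): this matches the final scaling of the
// scalar vp9_fdct16x16 reference, out = (x + 1 + (x < 0)) >> 2, a rounded
// shift that treats positive and negative values symmetrically.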
michael@0 1732
michael@0 1733 void fdct16_1d_8col(__m128i *in) {
michael@0 1734 // perform 16x16 1-D DCT for 8 columns
michael@0 1735 __m128i i[8], s[8], p[8], t[8], u[16], v[16];
michael@0 1736 const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
michael@0 1737 const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
michael@0 1738 const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
michael@0 1739 const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
michael@0 1740 const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
michael@0 1741 const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
michael@0 1742 const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
michael@0 1743 const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
michael@0 1744 const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
michael@0 1745 const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
michael@0 1746 const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
michael@0 1747 const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
michael@0 1748 const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
michael@0 1749 const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
michael@0 1750 const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
michael@0 1751 const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
michael@0 1752 const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
michael@0 1753 const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
michael@0 1754 const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
michael@0 1755
michael@0 1756 // stage 1
michael@0 1757 i[0] = _mm_add_epi16(in[0], in[15]);
michael@0 1758 i[1] = _mm_add_epi16(in[1], in[14]);
michael@0 1759 i[2] = _mm_add_epi16(in[2], in[13]);
michael@0 1760 i[3] = _mm_add_epi16(in[3], in[12]);
michael@0 1761 i[4] = _mm_add_epi16(in[4], in[11]);
michael@0 1762 i[5] = _mm_add_epi16(in[5], in[10]);
michael@0 1763 i[6] = _mm_add_epi16(in[6], in[9]);
michael@0 1764 i[7] = _mm_add_epi16(in[7], in[8]);
michael@0 1765
michael@0 1766 s[0] = _mm_sub_epi16(in[7], in[8]);
michael@0 1767 s[1] = _mm_sub_epi16(in[6], in[9]);
michael@0 1768 s[2] = _mm_sub_epi16(in[5], in[10]);
michael@0 1769 s[3] = _mm_sub_epi16(in[4], in[11]);
michael@0 1770 s[4] = _mm_sub_epi16(in[3], in[12]);
michael@0 1771 s[5] = _mm_sub_epi16(in[2], in[13]);
michael@0 1772 s[6] = _mm_sub_epi16(in[1], in[14]);
michael@0 1773 s[7] = _mm_sub_epi16(in[0], in[15]);
michael@0 1774
michael@0 1775 p[0] = _mm_add_epi16(i[0], i[7]);
michael@0 1776 p[1] = _mm_add_epi16(i[1], i[6]);
michael@0 1777 p[2] = _mm_add_epi16(i[2], i[5]);
michael@0 1778 p[3] = _mm_add_epi16(i[3], i[4]);
michael@0 1779 p[4] = _mm_sub_epi16(i[3], i[4]);
michael@0 1780 p[5] = _mm_sub_epi16(i[2], i[5]);
michael@0 1781 p[6] = _mm_sub_epi16(i[1], i[6]);
michael@0 1782 p[7] = _mm_sub_epi16(i[0], i[7]);
michael@0 1783
michael@0 1784 u[0] = _mm_add_epi16(p[0], p[3]);
michael@0 1785 u[1] = _mm_add_epi16(p[1], p[2]);
michael@0 1786 u[2] = _mm_sub_epi16(p[1], p[2]);
michael@0 1787 u[3] = _mm_sub_epi16(p[0], p[3]);
michael@0 1788
michael@0 1789 v[0] = _mm_unpacklo_epi16(u[0], u[1]);
michael@0 1790 v[1] = _mm_unpackhi_epi16(u[0], u[1]);
michael@0 1791 v[2] = _mm_unpacklo_epi16(u[2], u[3]);
michael@0 1792 v[3] = _mm_unpackhi_epi16(u[2], u[3]);
michael@0 1793
michael@0 1794 u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
michael@0 1795 u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
michael@0 1796 u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
michael@0 1797 u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
michael@0 1798 u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
michael@0 1799 u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
michael@0 1800 u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
michael@0 1801 u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
michael@0 1802
michael@0 1803 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1804 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1805 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1806 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1807 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1808 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1809 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1810 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1811
michael@0 1812 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
michael@0 1813 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
michael@0 1814 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
michael@0 1815 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
michael@0 1816 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
michael@0 1817 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
michael@0 1818 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
michael@0 1819 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
michael@0 1820
michael@0 1821 in[0] = _mm_packs_epi32(u[0], u[1]);
michael@0 1822 in[4] = _mm_packs_epi32(u[4], u[5]);
michael@0 1823 in[8] = _mm_packs_epi32(u[2], u[3]);
michael@0 1824 in[12] = _mm_packs_epi32(u[6], u[7]);
michael@0 1825
michael@0 1826 u[0] = _mm_unpacklo_epi16(p[5], p[6]);
michael@0 1827 u[1] = _mm_unpackhi_epi16(p[5], p[6]);
michael@0 1828 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
michael@0 1829 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
michael@0 1830 v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
michael@0 1831 v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
michael@0 1832
michael@0 1833 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 1834 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 1835 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 1836 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 1837
michael@0 1838 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1839 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1840 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1841 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1842
michael@0 1843 u[0] = _mm_packs_epi32(v[0], v[1]);
michael@0 1844 u[1] = _mm_packs_epi32(v[2], v[3]);
michael@0 1845
michael@0 1846 t[0] = _mm_add_epi16(p[4], u[0]);
michael@0 1847 t[1] = _mm_sub_epi16(p[4], u[0]);
michael@0 1848 t[2] = _mm_sub_epi16(p[7], u[1]);
michael@0 1849 t[3] = _mm_add_epi16(p[7], u[1]);
michael@0 1850
michael@0 1851 u[0] = _mm_unpacklo_epi16(t[0], t[3]);
michael@0 1852 u[1] = _mm_unpackhi_epi16(t[0], t[3]);
michael@0 1853 u[2] = _mm_unpacklo_epi16(t[1], t[2]);
michael@0 1854 u[3] = _mm_unpackhi_epi16(t[1], t[2]);
michael@0 1855
michael@0 1856 v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
michael@0 1857 v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
michael@0 1858 v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
michael@0 1859 v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
michael@0 1860 v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
michael@0 1861 v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
michael@0 1862 v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
michael@0 1863 v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
michael@0 1864
michael@0 1865 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 1866 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 1867 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 1868 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 1869 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 1870 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 1871 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 1872 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 1873
michael@0 1874 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1875 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1876 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1877 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1878 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 1879 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 1880 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 1881 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 1882
michael@0 1883 in[2] = _mm_packs_epi32(v[0], v[1]);
michael@0 1884 in[6] = _mm_packs_epi32(v[4], v[5]);
michael@0 1885 in[10] = _mm_packs_epi32(v[2], v[3]);
michael@0 1886 in[14] = _mm_packs_epi32(v[6], v[7]);
michael@0 1887
michael@0 1888 // stage 2
michael@0 1889 u[0] = _mm_unpacklo_epi16(s[2], s[5]);
michael@0 1890 u[1] = _mm_unpackhi_epi16(s[2], s[5]);
michael@0 1891 u[2] = _mm_unpacklo_epi16(s[3], s[4]);
michael@0 1892 u[3] = _mm_unpackhi_epi16(s[3], s[4]);
michael@0 1893
michael@0 1894 v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
michael@0 1895 v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
michael@0 1896 v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
michael@0 1897 v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
michael@0 1898 v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
michael@0 1899 v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
michael@0 1900 v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
michael@0 1901 v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
michael@0 1902
michael@0 1903 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 1904 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 1905 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 1906 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 1907 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 1908 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 1909 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 1910 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 1911
michael@0 1912 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1913 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1914 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1915 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1916 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 1917 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 1918 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 1919 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 1920
michael@0 1921 t[2] = _mm_packs_epi32(v[0], v[1]);
michael@0 1922 t[3] = _mm_packs_epi32(v[2], v[3]);
michael@0 1923 t[4] = _mm_packs_epi32(v[4], v[5]);
michael@0 1924 t[5] = _mm_packs_epi32(v[6], v[7]);
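// Added commentary (illustrative): stage 2 rotates the middle odd-half
// terms through cospi_16_64, i.e. t[2] = (s[5] - s[2]) * c16,
// t[3] = (s[4] - s[3]) * c16, t[4] = (s[3] + s[4]) * c16 and
// t[5] = (s[2] + s[5]) * c16, each followed by dct_const_round_shift.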
michael@0 1925
michael@0 1926 // stage 3
michael@0 1927 p[0] = _mm_add_epi16(s[0], t[3]);
michael@0 1928 p[1] = _mm_add_epi16(s[1], t[2]);
michael@0 1929 p[2] = _mm_sub_epi16(s[1], t[2]);
michael@0 1930 p[3] = _mm_sub_epi16(s[0], t[3]);
michael@0 1931 p[4] = _mm_sub_epi16(s[7], t[4]);
michael@0 1932 p[5] = _mm_sub_epi16(s[6], t[5]);
michael@0 1933 p[6] = _mm_add_epi16(s[6], t[5]);
michael@0 1934 p[7] = _mm_add_epi16(s[7], t[4]);
michael@0 1935
michael@0 1936 // stage 4
michael@0 1937 u[0] = _mm_unpacklo_epi16(p[1], p[6]);
michael@0 1938 u[1] = _mm_unpackhi_epi16(p[1], p[6]);
michael@0 1939 u[2] = _mm_unpacklo_epi16(p[2], p[5]);
michael@0 1940 u[3] = _mm_unpackhi_epi16(p[2], p[5]);
michael@0 1941
michael@0 1942 v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
michael@0 1943 v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
michael@0 1944 v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
michael@0 1945 v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
michael@0 1946 v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
michael@0 1947 v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
michael@0 1948 v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
michael@0 1949 v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
michael@0 1950
michael@0 1951 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 1952 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 1953 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 1954 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 1955 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 1956 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 1957 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 1958 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 1959
michael@0 1960 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1961 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1962 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1963 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1964 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 1965 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 1966 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 1967 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 1968
michael@0 1969 t[1] = _mm_packs_epi32(v[0], v[1]);
michael@0 1970 t[2] = _mm_packs_epi32(v[2], v[3]);
michael@0 1971 t[5] = _mm_packs_epi32(v[4], v[5]);
michael@0 1972 t[6] = _mm_packs_epi32(v[6], v[7]);
michael@0 1973
michael@0 1974 // stage 5
michael@0 1975 s[0] = _mm_add_epi16(p[0], t[1]);
michael@0 1976 s[1] = _mm_sub_epi16(p[0], t[1]);
michael@0 1977 s[2] = _mm_sub_epi16(p[3], t[2]);
michael@0 1978 s[3] = _mm_add_epi16(p[3], t[2]);
michael@0 1979 s[4] = _mm_add_epi16(p[4], t[5]);
michael@0 1980 s[5] = _mm_sub_epi16(p[4], t[5]);
michael@0 1981 s[6] = _mm_sub_epi16(p[7], t[6]);
michael@0 1982 s[7] = _mm_add_epi16(p[7], t[6]);
michael@0 1983
michael@0 1984 // stage 6
michael@0 1985 u[0] = _mm_unpacklo_epi16(s[0], s[7]);
michael@0 1986 u[1] = _mm_unpackhi_epi16(s[0], s[7]);
michael@0 1987 u[2] = _mm_unpacklo_epi16(s[1], s[6]);
michael@0 1988 u[3] = _mm_unpackhi_epi16(s[1], s[6]);
michael@0 1989 u[4] = _mm_unpacklo_epi16(s[2], s[5]);
michael@0 1990 u[5] = _mm_unpackhi_epi16(s[2], s[5]);
michael@0 1991 u[6] = _mm_unpacklo_epi16(s[3], s[4]);
michael@0 1992 u[7] = _mm_unpackhi_epi16(s[3], s[4]);
michael@0 1993
michael@0 1994 v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
michael@0 1995 v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
michael@0 1996 v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
michael@0 1997 v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
michael@0 1998 v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
michael@0 1999 v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
michael@0 2000 v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
michael@0 2001 v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
michael@0 2002 v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
michael@0 2003 v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
michael@0 2004 v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
michael@0 2005 v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
michael@0 2006 v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
michael@0 2007 v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
michael@0 2008 v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
michael@0 2009 v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
michael@0 2010
michael@0 2011 u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
michael@0 2012 u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
michael@0 2013 u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
michael@0 2014 u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
michael@0 2015 u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
michael@0 2016 u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
michael@0 2017 u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
michael@0 2018 u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
michael@0 2019 u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
michael@0 2020 u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
michael@0 2021 u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
michael@0 2022 u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
michael@0 2023 u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
michael@0 2024 u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
michael@0 2025 u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
michael@0 2026 u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
michael@0 2027
michael@0 2028 v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 2029 v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 2030 v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 2031 v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 2032 v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 2033 v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 2034 v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 2035 v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 2036 v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
michael@0 2037 v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
michael@0 2038 v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
michael@0 2039 v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
michael@0 2040 v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
michael@0 2041 v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
michael@0 2042 v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
michael@0 2043 v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
michael@0 2044
michael@0 2045 in[1] = _mm_packs_epi32(v[0], v[1]);
michael@0 2046 in[9] = _mm_packs_epi32(v[2], v[3]);
michael@0 2047 in[5] = _mm_packs_epi32(v[4], v[5]);
michael@0 2048 in[13] = _mm_packs_epi32(v[6], v[7]);
michael@0 2049 in[3] = _mm_packs_epi32(v[8], v[9]);
michael@0 2050 in[11] = _mm_packs_epi32(v[10], v[11]);
michael@0 2051 in[7] = _mm_packs_epi32(v[12], v[13]);
michael@0 2052 in[15] = _mm_packs_epi32(v[14], v[15]);
michael@0 2053 }
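// Illustrative sketch (hypothetical helper): every _mm_add_epi32 against
// k__DCT_CONST_ROUNDING followed by _mm_srai_epi32 above is the packed form
// of the scalar rounding used throughout vp9, with DCT_CONST_ROUNDING equal
// to 1 << (DCT_CONST_BITS - 1) from vp9_idct.h:
static INLINE int illustrative_dct_const_round_shift(int input) {
  const int rounded = input + DCT_CONST_ROUNDING;  // bias by half a step
  return rounded >> DCT_CONST_BITS;                // arithmetic shift back
}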
michael@0 2054
michael@0 2055 void fadst16_1d_8col(__m128i *in) {
michael@0 2056 // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

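  // The same fixed-point idiom recurs throughout: an unpack interleaves
  // two 16-bit rows so that _mm_madd_epi16 against a pair_set_epi16(c0, c1)
  // constant yields a * c0 + b * c1 in each 32-bit lane (one planar
  // rotation per multiply-add), and the products are then rounded back
  // down. As a scalar sketch of one such term (vp9_idct.h defines
  // DCT_CONST_BITS as 14 and DCT_CONST_ROUNDING as
  // 1 << (DCT_CONST_BITS - 1)):
  //
  //   t = a * cospi_1_64 + b * cospi_31_64;
  //   out = (t + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;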
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

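  // Butterfly while still in 32 bits: u[0..15] hold v[k] + v[k + 16] and
  // u[16..31] hold v[k] - v[k + 16], pairing each rotation product with
  // the matching product from the opposite half of the inputs.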
  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);
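  // _mm_packs_epi32 narrows the rounded 32-bit lanes back to 16 bits with
  // signed saturation; s[0..15] now hold the stage 1 outputs.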

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);
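  // The first half of stage 2 needs no rotation, so x[0..7] come straight
  // from 16-bit adds and subtracts of s[0..7]; x[8..15] repack the rounded
  // 32-bit rotation results computed above.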

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
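  // Output shuffle: the ADST emits its results in a permuted order, and
  // several terms are stored negated (kZero - s[k]) to match the sign
  // conventions of the scalar reference transform.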

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

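// Each 16x16 block is kept as two arrays of sixteen __m128i rows, one per
// eight-column half. A 1-D pass transforms both halves, then
// array_transpose_16x16 swaps rows and columns so that applying the same
// pass a second time completes the 2-D transform.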
void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
  fdct16_1d_8col(in0);
  fdct16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  fadst16_1d_8col(in0);
  fadst16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

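// Hybrid 16x16 forward transform: tx_type selects DCT or ADST
// independently for the two 1-D passes (each *_1d_sse2 call ends in a
// transpose, so the first pass sees columns and the second sees rows),
// with right_shift_16x16 applying a rounded downshift in between to keep
// the intermediate values in 16-bit range. A minimal usage sketch,
// assuming a row-major 16x16 residual block:
//
//   int16_t residual[16 * 16];  // input residual
//   int16_t coeff[16 * 16];     // output transform coefficients
//   vp9_short_fht16x16_sse2(residual, coeff, 16, 3);  // 3 == ADST_ADST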
void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                             int stride, int tx_type) {
  __m128i in0[16], in1[16];
  load_buffer_16x16(input, in0, in1, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_16x16(output, in0, in1, 16);
}

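// The two 32x32 forward DCT entry points below are generated from a single
// template: vp9_dct32x32_sse2.c is textually included twice with different
// FDCT32x32_2D / FDCT32x32_HIGH_PRECISION settings, yielding a
// lower-precision variant (the _rd version, used in rate-distortion
// search) and a full-precision variant from the same source.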
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION