--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/encoder/x86/vp9_dct_sse2.c	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,2579 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>     // for assert() in the fht functions below
#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
  {
    in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
    in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
    in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
    in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
    {
      // The mask will only contain whether the first value is zero; all
      // other comparisons will fail, as something shifted by 4 (above << 4)
      // can never be equal to one.
      // To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
    }
  }
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    // into 32 bits.
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31  in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33  in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition output and store it (v + 1) >> 2, taking advantage
      // of the fact 1/3 are stored just after 0/2.
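      // Editor's sketch (not from the original sources) of the scalar
      // equivalent of the biased shift below, with res[] standing in for
      // the 16 transposed coefficients held in in0/in2:
      //   for (j = 0; j < 16; ++j)
      //     output[j] = (int16_t)((res[j] + 1) >> 2);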
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
    }
  }
}

static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
                                   int stride) {
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  __m128i mask;

  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  in[0] = _mm_slli_epi16(in[0], 4);
  in[1] = _mm_slli_epi16(in[1], 4);
  in[2] = _mm_slli_epi16(in[2], 4);
  in[3] = _mm_slli_epi16(in[3], 4);

  mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
  in[0] = _mm_add_epi16(in[0], mask);
  in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
}

static INLINE void write_buffer_4x4(int16_t *output, __m128i *res) {
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
  __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
  __m128i out01 = _mm_add_epi16(in01, kOne);
  __m128i out23 = _mm_add_epi16(in23, kOne);
  out01 = _mm_srai_epi16(out01, 2);
  out23 = _mm_srai_epi16(out23, 2);
  _mm_store_si128((__m128i *)(output + 0 * 8), out01);
  _mm_store_si128((__m128i *)(output + 1 * 8), out23);
}

static INLINE void transpose_4x4(__m128i *res) {
  // Combine and transpose
  // 00 01 02 03 20 21 22 23
  // 10 11 12 13 30 31 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  // 00 10 20 30 01 11 21 31
  // 02 12 22 32 03 13 23 33
  // only use the first 4 16-bit integers
  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

void fdct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u[4], v[4];
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[3], in[2]);

  v[0] = _mm_add_epi16(u[0], u[1]);
  v[1] = _mm_sub_epi16(u[0], u[1]);

  u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);  // 0
  u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16);  // 2
  u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24);  // 1
  u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08);  // 3

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  transpose_4x4(in);
}

void fadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
  const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
  const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];
  __m128i in7 = _mm_add_epi16(in[0], in[1]);

  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpacklo_epi16(in[2], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[2], kZero);
  u[4] = _mm_unpacklo_epi16(in[3], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x1
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
  v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[6]);
  u[2] = _mm_add_epi32(v[3], v[4]);
  u[3] = _mm_sub_epi32(u[2], u[0]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_sub_epi32(u[4], v[5]);
  u[6] = _mm_add_epi32(u[3], u[5]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  transpose_4x4(in);
}

void vp9_short_fht4x4_sse2(const int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[4];
  load_buffer_4x4(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst4_1d_sse2(in);
      fdct4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst4_1d_sse2(in);
      fadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_4x4(output, in);
}

void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output,
                      int stride) {
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
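    // Editor's note: the q0..q7 stage below is the first butterfly of the
    // scalar 8-point DCT, applied to eight columns at once. Scalar form
    // (illustrative):
    //   q[i]     = in[i] + in[7 - i];  // i = 0..3
    //   q[7 - i] = in[i] - in[7 - i];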
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results
    {
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32 bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
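    // Editor's note: the transpose below is the usual three-level unpack
    // ladder -- 16-bit unpacks interleave adjacent rows, 32-bit unpacks
    // merge the resulting pairs, and 64-bit unpacks assemble the final
    // rows. For two rows a = a0..a7, b = b0..b7 (illustrative):
    //   _mm_unpacklo_epi16(a, b) -> a0 b0 a1 b1 a2 b2 a3 b3
    //   _mm_unpackhi_epi16(a, b) -> a4 b4 a5 b5 a6 b6 a7 b7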
1.434 + { 1.435 + // 00 01 02 03 04 05 06 07 1.436 + // 10 11 12 13 14 15 16 17 1.437 + // 20 21 22 23 24 25 26 27 1.438 + // 30 31 32 33 34 35 36 37 1.439 + // 40 41 42 43 44 45 46 47 1.440 + // 50 51 52 53 54 55 56 57 1.441 + // 60 61 62 63 64 65 66 67 1.442 + // 70 71 72 73 74 75 76 77 1.443 + const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); 1.444 + const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); 1.445 + const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); 1.446 + const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); 1.447 + const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); 1.448 + const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); 1.449 + const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); 1.450 + const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); 1.451 + // 00 10 01 11 02 12 03 13 1.452 + // 20 30 21 31 22 32 23 33 1.453 + // 04 14 05 15 06 16 07 17 1.454 + // 24 34 25 35 26 36 27 37 1.455 + // 40 50 41 51 42 52 43 53 1.456 + // 60 70 61 71 62 72 63 73 1.457 + // 54 54 55 55 56 56 57 57 1.458 + // 64 74 65 75 66 76 67 77 1.459 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1.460 + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1.461 + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1.462 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1.463 + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1.464 + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1.465 + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); 1.466 + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1.467 + // 00 10 20 30 01 11 21 31 1.468 + // 40 50 60 70 41 51 61 71 1.469 + // 02 12 22 32 03 13 23 33 1.470 + // 42 52 62 72 43 53 63 73 1.471 + // 04 14 24 34 05 15 21 36 1.472 + // 44 54 64 74 45 55 61 76 1.473 + // 06 16 26 36 07 17 27 37 1.474 + // 46 56 66 76 47 57 67 77 1.475 + in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1.476 + in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1.477 + in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 1.478 + in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1.479 + in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1.480 + in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1.481 + in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1.482 + in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1.483 + // 00 10 20 30 40 50 60 70 1.484 + // 01 11 21 31 41 51 61 71 1.485 + // 02 12 22 32 42 52 62 72 1.486 + // 03 13 23 33 43 53 63 73 1.487 + // 04 14 24 34 44 54 64 74 1.488 + // 05 15 25 35 45 55 65 75 1.489 + // 06 16 26 36 46 56 66 76 1.490 + // 07 17 27 37 47 57 67 77 1.491 + } 1.492 + } 1.493 + // Post-condition output and store it 1.494 + { 1.495 + // Post-condition (division by two) 1.496 + // division of two 16 bits signed numbers using shifts 1.497 + // n / 2 = (n - (n >> 15)) >> 1 1.498 + const __m128i sign_in0 = _mm_srai_epi16(in0, 15); 1.499 + const __m128i sign_in1 = _mm_srai_epi16(in1, 15); 1.500 + const __m128i sign_in2 = _mm_srai_epi16(in2, 15); 1.501 + const __m128i sign_in3 = _mm_srai_epi16(in3, 15); 1.502 + const __m128i sign_in4 = _mm_srai_epi16(in4, 15); 1.503 + const __m128i sign_in5 = _mm_srai_epi16(in5, 15); 1.504 + const __m128i sign_in6 = _mm_srai_epi16(in6, 15); 1.505 + const __m128i sign_in7 = _mm_srai_epi16(in7, 15); 1.506 + in0 = _mm_sub_epi16(in0, sign_in0); 1.507 + in1 = _mm_sub_epi16(in1, sign_in1); 1.508 + in2 = _mm_sub_epi16(in2, sign_in2); 1.509 + in3 = _mm_sub_epi16(in3, sign_in3); 1.510 + in4 = _mm_sub_epi16(in4, sign_in4); 1.511 + in5 = _mm_sub_epi16(in5, sign_in5); 1.512 + in6 = _mm_sub_epi16(in6, sign_in6); 1.513 + in7 = _mm_sub_epi16(in7, 
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_store_si128((__m128i *)(output + 0 * 8), in0);
    _mm_store_si128((__m128i *)(output + 1 * 8), in1);
    _mm_store_si128((__m128i *)(output + 2 * 8), in2);
    _mm_store_si128((__m128i *)(output + 3 * 8), in3);
    _mm_store_si128((__m128i *)(output + 4 * 8), in4);
    _mm_store_si128((__m128i *)(output + 5 * 8), in5);
    _mm_store_si128((__m128i *)(output + 6 * 8), in6);
    _mm_store_si128((__m128i *)(output + 7 * 8), in7);
  }
}

// load 8x8 array
static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
                                   int stride) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));

  in[0] = _mm_slli_epi16(in[0], 2);
  in[1] = _mm_slli_epi16(in[1], 2);
  in[2] = _mm_slli_epi16(in[2], 2);
  in[3] = _mm_slli_epi16(in[3], 2);
  in[4] = _mm_slli_epi16(in[4], 2);
  in[5] = _mm_slli_epi16(in[5], 2);
  in[6] = _mm_slli_epi16(in[6], 2);
  in[7] = _mm_slli_epi16(in[7], 2);
}

// right shift and rounding
static INLINE void right_shift_8x8(__m128i *res, int const bit) {
  const __m128i kOne = _mm_set1_epi16(1);
  const int bit_m02 = bit - 2;
  __m128i sign0 = _mm_srai_epi16(res[0], 15);
  __m128i sign1 = _mm_srai_epi16(res[1], 15);
  __m128i sign2 = _mm_srai_epi16(res[2], 15);
  __m128i sign3 = _mm_srai_epi16(res[3], 15);
  __m128i sign4 = _mm_srai_epi16(res[4], 15);
  __m128i sign5 = _mm_srai_epi16(res[5], 15);
  __m128i sign6 = _mm_srai_epi16(res[6], 15);
  __m128i sign7 = _mm_srai_epi16(res[7], 15);

  if (bit_m02 >= 0) {
    __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
    res[0] = _mm_add_epi16(res[0], k_const_rounding);
    res[1] = _mm_add_epi16(res[1], k_const_rounding);
    res[2] = _mm_add_epi16(res[2], k_const_rounding);
    res[3] = _mm_add_epi16(res[3], k_const_rounding);
    res[4] = _mm_add_epi16(res[4], k_const_rounding);
    res[5] = _mm_add_epi16(res[5], k_const_rounding);
    res[6] = _mm_add_epi16(res[6], k_const_rounding);
    res[7] = _mm_add_epi16(res[7], k_const_rounding);
  }

  res[0] = _mm_sub_epi16(res[0], sign0);
  res[1] = _mm_sub_epi16(res[1], sign1);
  res[2] = _mm_sub_epi16(res[2], sign2);
  res[3] = _mm_sub_epi16(res[3], sign3);
  res[4] = _mm_sub_epi16(res[4], sign4);
  res[5] = _mm_sub_epi16(res[5], sign5);
  res[6] = _mm_sub_epi16(res[6], sign6);
  res[7] = _mm_sub_epi16(res[7], sign7);

  res[0] = _mm_srai_epi16(res[0], bit);
  res[1] = _mm_srai_epi16(res[1], bit);
  res[2] = _mm_srai_epi16(res[2], bit);
  res[3] = _mm_srai_epi16(res[3], bit);
  res[4] = _mm_srai_epi16(res[4], bit);
  res[5] = _mm_srai_epi16(res[5], bit);
  res[6] = _mm_srai_epi16(res[6], bit);
  res[7] = _mm_srai_epi16(res[7], bit);
}

// write 8x8 array
static INLINE void write_buffer_8x8(int16_t *output, __m128i *res,
                                    int stride) {
  _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
  _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
  _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
  _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
  _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
  _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
  _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
  _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}

// perform in-place transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
  // 00 10 01 11 02 12 03 13
  // 20 30 21 31 22 32 23 33
  // 04 14 05 15 06 16 07 17
  // 24 34 25 35 26 36 27 37
  // 40 50 41 51 42 52 43 53
  // 60 70 61 71 62 72 63 73
  // 44 54 45 55 46 56 47 57
  // 64 74 65 75 66 76 67 77
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
  // 00 10 20 30 01 11 21 31
  // 40 50 60 70 41 51 61 71
  // 02 12 22 32 03 13 23 33
  // 42 52 62 72 43 53 63 73
  // 04 14 24 34 05 15 25 35
  // 44 54 64 74 45 55 65 75
  // 06 16 26 36 07 17 27 37
  // 46 56 66 76 47 57 67 77
  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
  // 00 10 20 30 40 50 60 70
  // 01 11 21 31 41 51 61 71
  // 02 12 22 32 42 52 62 72
  // 03 13 23 33 43 53 63 73
  // 04 14 24 34 44 54 64 74
  // 05 15 25 35 45 55 65 75
  // 06 16 26 36 46 56 66 76
  // 07 17 27 37 47 57 67 77
}

void fdct8_1d_sse2(__m128i *in) {
  // constants
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;

  // stage 1
  s0 = _mm_add_epi16(in[0], in[7]);
  s1 = _mm_add_epi16(in[1], in[6]);
  s2 = _mm_add_epi16(in[2], in[5]);
  s3 = _mm_add_epi16(in[3], in[4]);
  s4 = _mm_sub_epi16(in[3], in[4]);
  s5 = _mm_sub_epi16(in[2], in[5]);
  s6 = _mm_sub_epi16(in[1], in[6]);
  s7 = _mm_sub_epi16(in[0], in[7]);

  u0 = _mm_add_epi16(s0, s3);
  u1 = _mm_add_epi16(s1, s2);
  u2 = _mm_sub_epi16(s1, s2);
  u3 = _mm_sub_epi16(s0, s3);
  // interleave and perform butterfly multiplication/addition
  v0 = _mm_unpacklo_epi16(u0, u1);
  v1 = _mm_unpackhi_epi16(u0, u1);
  v2 = _mm_unpacklo_epi16(u2, u3);
  v3 = _mm_unpackhi_epi16(u2, u3);

  u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
  u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
  u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
  u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
  u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
  u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
  u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
  u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);

  // shift and rounding
  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u0, u1);
  in[2] = _mm_packs_epi32(u4, u5);
  in[4] = _mm_packs_epi32(u2, u3);
  in[6] = _mm_packs_epi32(u6, u7);

  // stage 2
  // interleave and perform butterfly multiplication/addition
  u0 = _mm_unpacklo_epi16(s6, s5);
  u1 = _mm_unpackhi_epi16(s6, s5);
  v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);

  u0 = _mm_packs_epi32(v0, v1);
  u1 = _mm_packs_epi32(v2, v3);

  // stage 3
  s0 = _mm_add_epi16(s4, u0);
  s1 = _mm_sub_epi16(s4, u0);
  s2 = _mm_sub_epi16(s7, u1);
  s3 = _mm_add_epi16(s7, u1);

  // stage 4
  u0 = _mm_unpacklo_epi16(s0, s3);
  u1 = _mm_unpackhi_epi16(s0, s3);
  u2 = _mm_unpacklo_epi16(s1, s2);
  u3 = _mm_unpackhi_epi16(s1, s2);

  v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
  v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
  v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
  v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
  v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
  v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
  v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
  v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);

  // shift and rounding
  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  in[1] = _mm_packs_epi32(v0, v1);
  in[3] = _mm_packs_epi32(v4, v5);
  in[5] = _mm_packs_epi32(v2, v3);
  in[7] = _mm_packs_epi32(v6, v7);

  // transpose
  array_transpose_8x8(in, in);
}

void fadst8_1d_sse2(__m128i *in) {
  // Constants
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
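
  // Editor's note: this mirrors the scalar 8-point ADST of the C encoder.
  // Stage 1 rotates the reordered input pairs by the odd angles
  // (cospi 2/30, 10/22, 18/14, 26/6), the later stages recombine the two
  // halves with cospi 8/24 and cospi 16 rotations, and the final
  // negations produce the alternating output signs.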
  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  // FIXME(jingning): do subtract using bit inversion?
  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);

  // transpose
  array_transpose_8x8(in, in);
}

void vp9_short_fht8x8_sse2(const int16_t *input, int16_t *output,
                           int stride, int tx_type) {
  __m128i in[8];
  load_buffer_8x8(input, in, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      fadst8_1d_sse2(in);
      fdct8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      fdct8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      fadst8_1d_sse2(in);
      fadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }
  right_shift_8x8(in, 1);
  write_buffer_8x8(output, in, 8);
}

void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
  const int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
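  // Editor's note on the idiom: pair_set_epi16(a, b) (via vp9_idct.h)
  // broadcasts the 16-bit pair into every 32-bit lane, so a single
  // _mm_madd_epi16 against interleaved data computes x[i] * a + y[i] * b
  // in each of the four 32-bit lanes -- one rotation of the butterfly per
  // instruction.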
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
        // x = x << 2
        in00 = _mm_slli_epi16(in00, 2);
        in01 = _mm_slli_epi16(in01, 2);
        in02 = _mm_slli_epi16(in02, 2);
        in03 = _mm_slli_epi16(in03, 2);
        in04 = _mm_slli_epi16(in04, 2);
        in05 = _mm_slli_epi16(in05, 2);
        in06 = _mm_slli_epi16(in06, 2);
        in07 = _mm_slli_epi16(in07, 2);
        in08 = _mm_slli_epi16(in08, 2);
        in09 = _mm_slli_epi16(in09, 2);
        in10 = _mm_slli_epi16(in10, 2);
        in11 = _mm_slli_epi16(in11, 2);
        in12 = _mm_slli_epi16(in12, 2);
        in13 = _mm_slli_epi16(in13, 2);
        in14 = _mm_slli_epi16(in14, 2);
        in15 = _mm_slli_epi16(in15, 2);
      } else {
        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
        // x = (x + 1) >> 2
        in00 = _mm_add_epi16(in00, kOne);
        in01 = _mm_add_epi16(in01, kOne);
        in02 = _mm_add_epi16(in02, kOne);
        in03 = _mm_add_epi16(in03, kOne);
        in04 = _mm_add_epi16(in04, kOne);
        in05 = _mm_add_epi16(in05, kOne);
        in06 = _mm_add_epi16(in06, kOne);
        in07 = _mm_add_epi16(in07, kOne);
        in08 = _mm_add_epi16(in08, kOne);
        in09 = _mm_add_epi16(in09, kOne);
        in10 = _mm_add_epi16(in10, kOne);
        in11 = _mm_add_epi16(in11, kOne);
        in12 = _mm_add_epi16(in12, kOne);
        in13 = _mm_add_epi16(in13, kOne);
        in14 = _mm_add_epi16(in14, kOne);
        in15 = _mm_add_epi16(in15, kOne);
        in00 = _mm_srai_epi16(in00, 2);
        in01 = _mm_srai_epi16(in01, 2);
        in02 = _mm_srai_epi16(in02, 2);
        in03 = _mm_srai_epi16(in03, 2);
        in04 = _mm_srai_epi16(in04, 2);
        in05 = _mm_srai_epi16(in05, 2);
        in06 = _mm_srai_epi16(in06, 2);
        in07 = _mm_srai_epi16(in07, 2);
        in08 = _mm_srai_epi16(in08, 2);
        in09 = _mm_srai_epi16(in09, 2);
        in10 = _mm_srai_epi16(in10, 2);
        in11 = _mm_srai_epi16(in11, 2);
        in12 = _mm_srai_epi16(in12, 2);
        in13 = _mm_srai_epi16(in13, 2);
        in14 = _mm_srai_epi16(in14, 2);
        in15 = _mm_srai_epi16(in15, 2);
      }
      in += 8;
      // Calculate input for the first 8 results.
      {
        input0 = _mm_add_epi16(in00, in15);
        input1 = _mm_add_epi16(in01, in14);
        input2 = _mm_add_epi16(in02, in13);
        input3 = _mm_add_epi16(in03, in12);
        input4 = _mm_add_epi16(in04, in11);
        input5 = _mm_add_epi16(in05, in10);
        input6 = _mm_add_epi16(in06, in09);
        input7 = _mm_add_epi16(in07, in08);
      }
      // Calculate input for the next 8 results.
      {
        step1_0 = _mm_sub_epi16(in07, in08);
        step1_1 = _mm_sub_epi16(in06, in09);
        step1_2 = _mm_sub_epi16(in05, in10);
        step1_3 = _mm_sub_epi16(in04, in11);
        step1_4 = _mm_sub_epi16(in03, in12);
        step1_5 = _mm_sub_epi16(in02, in13);
        step1_6 = _mm_sub_epi16(in01, in14);
        step1_7 = _mm_sub_epi16(in00, in15);
      }
      // Work on the first eight values; fdct8_1d(input, even_results);
      {
        // Add/subtract
        const __m128i q0 = _mm_add_epi16(input0, input7);
        const __m128i q1 = _mm_add_epi16(input1, input6);
        const __m128i q2 = _mm_add_epi16(input2, input5);
        const __m128i q3 = _mm_add_epi16(input3, input4);
        const __m128i q4 = _mm_sub_epi16(input3, input4);
        const __m128i q5 = _mm_sub_epi16(input2, input5);
        const __m128i q6 = _mm_sub_epi16(input1, input6);
        const __m128i q7 = _mm_sub_epi16(input0, input7);
        // Work on first four results
        {
          // Add/subtract
          const __m128i r0 = _mm_add_epi16(q0, q3);
          const __m128i r1 = _mm_add_epi16(q1, q2);
          const __m128i r2 = _mm_sub_epi16(q1, q2);
          const __m128i r3 = _mm_sub_epi16(q0, q3);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
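          // Editor's note: after _mm_unpacklo_epi16(r0, r1) each 32-bit
          // lane holds a (r0[i], r1[i]) pair, so the _mm_madd_epi16 calls
          // below evaluate r0[i] * c0 + r1[i] * c1 in one instruction; the
          // k__DCT_CONST_ROUNDING add plus DCT_CONST_BITS shift that
          // follow are the vector form of the scalar dct_const_round_shift.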
1.1242 + const __m128i t0 = _mm_unpacklo_epi16(r0, r1); 1.1243 + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); 1.1244 + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); 1.1245 + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); 1.1246 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1.1247 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1.1248 + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1.1249 + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1.1250 + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); 1.1251 + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); 1.1252 + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1.1253 + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1.1254 + // dct_const_round_shift 1.1255 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1256 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1257 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1258 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1259 + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); 1.1260 + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); 1.1261 + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); 1.1262 + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); 1.1263 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1264 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1265 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1266 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1267 + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); 1.1268 + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); 1.1269 + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); 1.1270 + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); 1.1271 + // Combine 1.1272 + res00 = _mm_packs_epi32(w0, w1); 1.1273 + res08 = _mm_packs_epi32(w2, w3); 1.1274 + res04 = _mm_packs_epi32(w4, w5); 1.1275 + res12 = _mm_packs_epi32(w6, w7); 1.1276 + } 1.1277 + // Work on next four results 1.1278 + { 1.1279 + // Interleave to do the multiply by constants which gets us 1.1280 + // into 32 bits. 
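+ // (Sketch of this block's flow, for orientation.) (q6 - q5) and
+ // (q6 + q5), scaled by cospi_16_64 and rounded, form r0 and r1; these
+ // are added to and subtracted from q4 and q7, and the 28/4 and 12/20
+ // cospi rotations then produce the remaining even outputs res02,
+ // res06, res10 and res14.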
1.1281 + const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
1.1282 + const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
1.1283 + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
1.1284 + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
1.1285 + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
1.1286 + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
1.1287 + // dct_const_round_shift
1.1288 + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
1.1289 + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
1.1290 + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
1.1291 + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
1.1292 + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
1.1293 + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
1.1294 + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
1.1295 + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
1.1296 + // Combine
1.1297 + const __m128i r0 = _mm_packs_epi32(s0, s1);
1.1298 + const __m128i r1 = _mm_packs_epi32(s2, s3);
1.1299 + // Add/subtract
1.1300 + const __m128i x0 = _mm_add_epi16(q4, r0);
1.1301 + const __m128i x1 = _mm_sub_epi16(q4, r0);
1.1302 + const __m128i x2 = _mm_sub_epi16(q7, r1);
1.1303 + const __m128i x3 = _mm_add_epi16(q7, r1);
1.1304 + // Interleave to do the multiply by constants which gets us
1.1305 + // into 32 bits.
1.1306 + const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
1.1307 + const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
1.1308 + const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
1.1309 + const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
1.1310 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
1.1311 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
1.1312 + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
1.1313 + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
1.1314 + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
1.1315 + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
1.1316 + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
1.1317 + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
1.1318 + // dct_const_round_shift
1.1319 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
1.1320 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
1.1321 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
1.1322 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
1.1323 + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
1.1324 + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
1.1325 + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
1.1326 + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
1.1327 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
1.1328 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
1.1329 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
1.1330 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
1.1331 + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
1.1332 + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
1.1333 + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
1.1334 + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
1.1335 + // Combine
1.1336 + res02 = _mm_packs_epi32(w0, w1);
1.1337 + res14 = _mm_packs_epi32(w2, w3);
1.1338 + res10 = _mm_packs_epi32(w4, w5);
1.1339 + res06 = _mm_packs_epi32(w6, w7);
1.1340 + }
1.1341 + }
1.1342 + // Work on the next eight values; step1 -> odd_results
1.1343 + {
1.1344 + //
step 2 1.1345 + { 1.1346 + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1.1347 + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1.1348 + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1.1349 + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1.1350 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); 1.1351 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); 1.1352 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); 1.1353 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); 1.1354 + // dct_const_round_shift 1.1355 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1356 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1357 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1358 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1359 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1360 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1361 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1362 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1363 + // Combine 1.1364 + step2_2 = _mm_packs_epi32(w0, w1); 1.1365 + step2_3 = _mm_packs_epi32(w2, w3); 1.1366 + } 1.1367 + { 1.1368 + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); 1.1369 + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); 1.1370 + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); 1.1371 + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); 1.1372 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); 1.1373 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); 1.1374 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); 1.1375 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); 1.1376 + // dct_const_round_shift 1.1377 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1378 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1379 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1380 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1381 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1382 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1383 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1384 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1385 + // Combine 1.1386 + step2_5 = _mm_packs_epi32(w0, w1); 1.1387 + step2_4 = _mm_packs_epi32(w2, w3); 1.1388 + } 1.1389 + // step 3 1.1390 + { 1.1391 + step3_0 = _mm_add_epi16(step1_0, step2_3); 1.1392 + step3_1 = _mm_add_epi16(step1_1, step2_2); 1.1393 + step3_2 = _mm_sub_epi16(step1_1, step2_2); 1.1394 + step3_3 = _mm_sub_epi16(step1_0, step2_3); 1.1395 + step3_4 = _mm_sub_epi16(step1_7, step2_4); 1.1396 + step3_5 = _mm_sub_epi16(step1_6, step2_5); 1.1397 + step3_6 = _mm_add_epi16(step1_6, step2_5); 1.1398 + step3_7 = _mm_add_epi16(step1_7, step2_4); 1.1399 + } 1.1400 + // step 4 1.1401 + { 1.1402 + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1.1403 + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1.1404 + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1.1405 + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1.1406 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); 1.1407 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); 1.1408 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); 1.1409 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); 1.1410 + // dct_const_round_shift 1.1411 + const __m128i v0 = 
_mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1412 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1413 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1414 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1415 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1416 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1417 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1418 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1419 + // Combine 1.1420 + step2_1 = _mm_packs_epi32(w0, w1); 1.1421 + step2_2 = _mm_packs_epi32(w2, w3); 1.1422 + } 1.1423 + { 1.1424 + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); 1.1425 + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); 1.1426 + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); 1.1427 + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); 1.1428 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); 1.1429 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); 1.1430 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); 1.1431 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); 1.1432 + // dct_const_round_shift 1.1433 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1434 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1435 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1436 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1437 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1438 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1439 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1440 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1441 + // Combine 1.1442 + step2_6 = _mm_packs_epi32(w0, w1); 1.1443 + step2_5 = _mm_packs_epi32(w2, w3); 1.1444 + } 1.1445 + // step 5 1.1446 + { 1.1447 + step1_0 = _mm_add_epi16(step3_0, step2_1); 1.1448 + step1_1 = _mm_sub_epi16(step3_0, step2_1); 1.1449 + step1_2 = _mm_sub_epi16(step3_3, step2_2); 1.1450 + step1_3 = _mm_add_epi16(step3_3, step2_2); 1.1451 + step1_4 = _mm_add_epi16(step3_4, step2_5); 1.1452 + step1_5 = _mm_sub_epi16(step3_4, step2_5); 1.1453 + step1_6 = _mm_sub_epi16(step3_7, step2_6); 1.1454 + step1_7 = _mm_add_epi16(step3_7, step2_6); 1.1455 + } 1.1456 + // step 6 1.1457 + { 1.1458 + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1.1459 + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1.1460 + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1.1461 + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1.1462 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); 1.1463 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); 1.1464 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); 1.1465 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); 1.1466 + // dct_const_round_shift 1.1467 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1468 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1469 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1470 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1471 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1472 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1473 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1474 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1475 + // Combine 1.1476 + res01 = _mm_packs_epi32(w0, w1); 1.1477 + res09 = _mm_packs_epi32(w2, w3); 1.1478 + } 
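+ // (Note.) The three blocks below repeat the same unpack/madd/
+ // round/pack pattern with the remaining odd cospi pairs to produce
+ // the other odd-indexed outputs: (res05, res13), (res11, res03) and
+ // (res15, res07).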
1.1479 + { 1.1480 + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1.1481 + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1.1482 + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1.1483 + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1.1484 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); 1.1485 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); 1.1486 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); 1.1487 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); 1.1488 + // dct_const_round_shift 1.1489 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1490 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1491 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1492 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1493 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1494 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1495 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1496 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1497 + // Combine 1.1498 + res05 = _mm_packs_epi32(w0, w1); 1.1499 + res13 = _mm_packs_epi32(w2, w3); 1.1500 + } 1.1501 + { 1.1502 + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); 1.1503 + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); 1.1504 + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); 1.1505 + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); 1.1506 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); 1.1507 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); 1.1508 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); 1.1509 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); 1.1510 + // dct_const_round_shift 1.1511 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1512 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1513 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1514 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1515 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1516 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1517 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1518 + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); 1.1519 + // Combine 1.1520 + res11 = _mm_packs_epi32(w0, w1); 1.1521 + res03 = _mm_packs_epi32(w2, w3); 1.1522 + } 1.1523 + { 1.1524 + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); 1.1525 + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); 1.1526 + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); 1.1527 + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); 1.1528 + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); 1.1529 + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); 1.1530 + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); 1.1531 + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); 1.1532 + // dct_const_round_shift 1.1533 + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); 1.1534 + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); 1.1535 + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); 1.1536 + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); 1.1537 + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); 1.1538 + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); 1.1539 + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); 1.1540 + const __m128i w3 = _mm_srai_epi32(v3, 
DCT_CONST_BITS);
1.1541 + // Combine
1.1542 + res15 = _mm_packs_epi32(w0, w1);
1.1543 + res07 = _mm_packs_epi32(w2, w3);
1.1544 + }
1.1545 + }
1.1546 + // Transpose the results, do it as two 8x8 transposes.
1.1547 + {
1.1548 + // 00 01 02 03 04 05 06 07
1.1549 + // 10 11 12 13 14 15 16 17
1.1550 + // 20 21 22 23 24 25 26 27
1.1551 + // 30 31 32 33 34 35 36 37
1.1552 + // 40 41 42 43 44 45 46 47
1.1553 + // 50 51 52 53 54 55 56 57
1.1554 + // 60 61 62 63 64 65 66 67
1.1555 + // 70 71 72 73 74 75 76 77
1.1556 + const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
1.1557 + const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
1.1558 + const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
1.1559 + const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
1.1560 + const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
1.1561 + const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
1.1562 + const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
1.1563 + const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
1.1564 + // 00 10 01 11 02 12 03 13
1.1565 + // 20 30 21 31 22 32 23 33
1.1566 + // 04 14 05 15 06 16 07 17
1.1567 + // 24 34 25 35 26 36 27 37
1.1568 + // 40 50 41 51 42 52 43 53
1.1569 + // 60 70 61 71 62 72 63 73
1.1570 + // 44 54 45 55 46 56 47 57
1.1571 + // 64 74 65 75 66 76 67 77
1.1572 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
1.1573 + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
1.1574 + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
1.1575 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
1.1576 + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
1.1577 + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
1.1578 + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
1.1579 + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
1.1580 + // 00 10 20 30 01 11 21 31
1.1581 + // 40 50 60 70 41 51 61 71
1.1582 + // 02 12 22 32 03 13 23 33
1.1583 + // 42 52 62 72 43 53 63 73
1.1584 + // 04 14 24 34 05 15 25 35
1.1585 + // 44 54 64 74 45 55 65 75
1.1586 + // 06 16 26 36 07 17 27 37
1.1587 + // 46 56 66 76 47 57 67 77
1.1588 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
1.1589 + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
1.1590 + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
1.1591 + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
1.1592 + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
1.1593 + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
1.1594 + const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
1.1595 + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
1.1596 + // 00 10 20 30 40 50 60 70
1.1597 + // 01 11 21 31 41 51 61 71
1.1598 + // 02 12 22 32 42 52 62 72
1.1599 + // 03 13 23 33 43 53 63 73
1.1600 + // 04 14 24 34 44 54 64 74
1.1601 + // 05 15 25 35 45 55 65 75
1.1602 + // 06 16 26 36 46 56 66 76
1.1603 + // 07 17 27 37 47 57 67 77
1.1604 + _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
1.1605 + _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
1.1606 + _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
1.1607 + _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
1.1608 + _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
1.1609 + _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
1.1610 + _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
1.1611 + _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
1.1612 + }
1.1613 + {
1.1614 + // 00 01 02 03 04 05 06 07
1.1615 + // 10 11 12 13 14 15 16 17
1.1616 + // 20 21 22 23 24 25 26 27
1.1617 + // 30 31 32 33 34 35 36 37
1.1618 + // 40 41 42 43 44 45 46 47
1.1619 + // 50 51 52 53 54 55 56 57
1.1620 + // 60 61 62 63 64 65 66 67
1.1621 + // 70 71 72 73 74 75 76 77
1.1622 + const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
1.1623 + const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
1.1624 + const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
1.1625 + const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
1.1626 + const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
1.1627 + const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
1.1628 + const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
1.1629 + const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
1.1630 + // 00 10 01 11 02 12 03 13
1.1631 + // 20 30 21 31 22 32 23 33
1.1632 + // 04 14 05 15 06 16 07 17
1.1633 + // 24 34 25 35 26 36 27 37
1.1634 + // 40 50 41 51 42 52 43 53
1.1635 + // 60 70 61 71 62 72 63 73
1.1636 + // 44 54 45 55 46 56 47 57
1.1637 + // 64 74 65 75 66 76 67 77
1.1638 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
1.1639 + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
1.1640 + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
1.1641 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
1.1642 + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
1.1643 + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
1.1644 + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
1.1645 + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
1.1646 + // 00 10 20 30 01 11 21 31
1.1647 + // 40 50 60 70 41 51 61 71
1.1648 + // 02 12 22 32 03 13 23 33
1.1649 + // 42 52 62 72 43 53 63 73
1.1650 + // 04 14 24 34 05 15 25 35
1.1651 + // 44 54 64 74 45 55 65 75
1.1652 + // 06 16 26 36 07 17 27 37
1.1653 + // 46 56 66 76 47 57 67 77
1.1654 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
1.1655 + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
1.1656 + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
1.1657 + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
1.1658 + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
1.1659 + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
1.1660 + const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
1.1661 + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
1.1662 + // 00 10 20 30 40 50 60 70
1.1663 + // 01 11 21 31 41 51 61 71
1.1664 + // 02 12 22 32 42 52 62 72
1.1665 + // 03 13 23 33 43 53 63 73
1.1666 + // 04 14 24 34 44 54 64 74
1.1667 + // 05 15 25 35 45 55 65 75
1.1668 + // 06 16 26 36 46 56 66 76
1.1669 + // 07 17 27 37 47 57 67 77
1.1670 + // Store results
1.1671 + _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
1.1672 + _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
1.1673 + _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
1.1674 + _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
1.1675 + _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
1.1676 + _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
1.1677 + _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
1.1678 + _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
1.1679 + }
1.1680 + out += 8*16;
1.1681 + }
1.1682 + // Setup in/out for next pass.
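+ // (Note.) On the second pass the 16x16 intermediate buffer is read
+ // back with a fixed stride of 16 (the `else` branch of the loads
+ // above), so the same column-transform code also performs the row
+ // transform on the transposed data.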
1.1683 + in = intermediate; 1.1684 + out = output; 1.1685 + } 1.1686 +} 1.1687 + 1.1688 +static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0, 1.1689 + __m128i *in1, int stride) { 1.1690 + // load first 8 columns 1.1691 + load_buffer_8x8(input, in0, stride); 1.1692 + load_buffer_8x8(input + 8 * stride, in0 + 8, stride); 1.1693 + 1.1694 + input += 8; 1.1695 + // load second 8 columns 1.1696 + load_buffer_8x8(input, in1, stride); 1.1697 + load_buffer_8x8(input + 8 * stride, in1 + 8, stride); 1.1698 +} 1.1699 + 1.1700 +static INLINE void write_buffer_16x16(int16_t *output, __m128i *in0, 1.1701 + __m128i *in1, int stride) { 1.1702 + // write first 8 columns 1.1703 + write_buffer_8x8(output, in0, stride); 1.1704 + write_buffer_8x8(output + 8 * stride, in0 + 8, stride); 1.1705 + // write second 8 columns 1.1706 + output += 8; 1.1707 + write_buffer_8x8(output, in1, stride); 1.1708 + write_buffer_8x8(output + 8 * stride, in1 + 8, stride); 1.1709 +} 1.1710 + 1.1711 +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 1.1712 + __m128i tbuf[8]; 1.1713 + array_transpose_8x8(res0, res0); 1.1714 + array_transpose_8x8(res1, tbuf); 1.1715 + array_transpose_8x8(res0 + 8, res1); 1.1716 + array_transpose_8x8(res1 + 8, res1 + 8); 1.1717 + 1.1718 + res0[8] = tbuf[0]; 1.1719 + res0[9] = tbuf[1]; 1.1720 + res0[10] = tbuf[2]; 1.1721 + res0[11] = tbuf[3]; 1.1722 + res0[12] = tbuf[4]; 1.1723 + res0[13] = tbuf[5]; 1.1724 + res0[14] = tbuf[6]; 1.1725 + res0[15] = tbuf[7]; 1.1726 +} 1.1727 + 1.1728 +static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { 1.1729 + // perform rounding operations 1.1730 + right_shift_8x8(res0, 2); 1.1731 + right_shift_8x8(res0 + 8, 2); 1.1732 + right_shift_8x8(res1, 2); 1.1733 + right_shift_8x8(res1 + 8, 2); 1.1734 +} 1.1735 + 1.1736 +void fdct16_1d_8col(__m128i *in) { 1.1737 + // perform 16x16 1-D DCT for 8 columns 1.1738 + __m128i i[8], s[8], p[8], t[8], u[16], v[16]; 1.1739 + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1.1740 + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1.1741 + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.1742 + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1.1743 + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1.1744 + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.1745 + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); 1.1746 + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1.1747 + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); 1.1748 + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1.1749 + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); 1.1750 + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); 1.1751 + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1.1752 + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1.1753 + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); 1.1754 + const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); 1.1755 + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1.1756 + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1.1757 + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.1758 + 
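+ // Orientation for the stages below (a sketch, not part of the
+ // change): stage 1 splits the 16 inputs into sums i[] and
+ // differences s[]; the sums are reduced like an 8-point fdct into the
+ // even-indexed outputs in[0], in[2], ..., in[14], while stages 2-6
+ // rotate the differences into the odd-indexed outputs in[1], in[3],
+ // ..., in[15]. Each __m128i holds one value for each of the eight
+ // columns being transformed, one per 16-bit lane.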
1.1759 + // stage 1 1.1760 + i[0] = _mm_add_epi16(in[0], in[15]); 1.1761 + i[1] = _mm_add_epi16(in[1], in[14]); 1.1762 + i[2] = _mm_add_epi16(in[2], in[13]); 1.1763 + i[3] = _mm_add_epi16(in[3], in[12]); 1.1764 + i[4] = _mm_add_epi16(in[4], in[11]); 1.1765 + i[5] = _mm_add_epi16(in[5], in[10]); 1.1766 + i[6] = _mm_add_epi16(in[6], in[9]); 1.1767 + i[7] = _mm_add_epi16(in[7], in[8]); 1.1768 + 1.1769 + s[0] = _mm_sub_epi16(in[7], in[8]); 1.1770 + s[1] = _mm_sub_epi16(in[6], in[9]); 1.1771 + s[2] = _mm_sub_epi16(in[5], in[10]); 1.1772 + s[3] = _mm_sub_epi16(in[4], in[11]); 1.1773 + s[4] = _mm_sub_epi16(in[3], in[12]); 1.1774 + s[5] = _mm_sub_epi16(in[2], in[13]); 1.1775 + s[6] = _mm_sub_epi16(in[1], in[14]); 1.1776 + s[7] = _mm_sub_epi16(in[0], in[15]); 1.1777 + 1.1778 + p[0] = _mm_add_epi16(i[0], i[7]); 1.1779 + p[1] = _mm_add_epi16(i[1], i[6]); 1.1780 + p[2] = _mm_add_epi16(i[2], i[5]); 1.1781 + p[3] = _mm_add_epi16(i[3], i[4]); 1.1782 + p[4] = _mm_sub_epi16(i[3], i[4]); 1.1783 + p[5] = _mm_sub_epi16(i[2], i[5]); 1.1784 + p[6] = _mm_sub_epi16(i[1], i[6]); 1.1785 + p[7] = _mm_sub_epi16(i[0], i[7]); 1.1786 + 1.1787 + u[0] = _mm_add_epi16(p[0], p[3]); 1.1788 + u[1] = _mm_add_epi16(p[1], p[2]); 1.1789 + u[2] = _mm_sub_epi16(p[1], p[2]); 1.1790 + u[3] = _mm_sub_epi16(p[0], p[3]); 1.1791 + 1.1792 + v[0] = _mm_unpacklo_epi16(u[0], u[1]); 1.1793 + v[1] = _mm_unpackhi_epi16(u[0], u[1]); 1.1794 + v[2] = _mm_unpacklo_epi16(u[2], u[3]); 1.1795 + v[3] = _mm_unpackhi_epi16(u[2], u[3]); 1.1796 + 1.1797 + u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); 1.1798 + u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); 1.1799 + u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); 1.1800 + u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); 1.1801 + u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); 1.1802 + u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); 1.1803 + u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); 1.1804 + u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); 1.1805 + 1.1806 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1807 + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1808 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1809 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1810 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1811 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1812 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1813 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1814 + 1.1815 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.1816 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.1817 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.1818 + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.1819 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.1820 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.1821 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.1822 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.1823 + 1.1824 + in[0] = _mm_packs_epi32(u[0], u[1]); 1.1825 + in[4] = _mm_packs_epi32(u[4], u[5]); 1.1826 + in[8] = _mm_packs_epi32(u[2], u[3]); 1.1827 + in[12] = _mm_packs_epi32(u[6], u[7]); 1.1828 + 1.1829 + u[0] = _mm_unpacklo_epi16(p[5], p[6]); 1.1830 + u[1] = _mm_unpackhi_epi16(p[5], p[6]); 1.1831 + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1.1832 + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1.1833 + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1.1834 + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1.1835 + 1.1836 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.1837 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.1838 + u[2] = 
_mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.1839 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.1840 + 1.1841 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1842 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1843 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1844 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1845 + 1.1846 + u[0] = _mm_packs_epi32(v[0], v[1]); 1.1847 + u[1] = _mm_packs_epi32(v[2], v[3]); 1.1848 + 1.1849 + t[0] = _mm_add_epi16(p[4], u[0]); 1.1850 + t[1] = _mm_sub_epi16(p[4], u[0]); 1.1851 + t[2] = _mm_sub_epi16(p[7], u[1]); 1.1852 + t[3] = _mm_add_epi16(p[7], u[1]); 1.1853 + 1.1854 + u[0] = _mm_unpacklo_epi16(t[0], t[3]); 1.1855 + u[1] = _mm_unpackhi_epi16(t[0], t[3]); 1.1856 + u[2] = _mm_unpacklo_epi16(t[1], t[2]); 1.1857 + u[3] = _mm_unpackhi_epi16(t[1], t[2]); 1.1858 + 1.1859 + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); 1.1860 + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); 1.1861 + v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); 1.1862 + v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); 1.1863 + v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); 1.1864 + v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); 1.1865 + v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); 1.1866 + v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); 1.1867 + 1.1868 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.1869 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.1870 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.1871 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.1872 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.1873 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.1874 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.1875 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.1876 + 1.1877 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1878 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1879 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1880 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1881 + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.1882 + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.1883 + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.1884 + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.1885 + 1.1886 + in[2] = _mm_packs_epi32(v[0], v[1]); 1.1887 + in[6] = _mm_packs_epi32(v[4], v[5]); 1.1888 + in[10] = _mm_packs_epi32(v[2], v[3]); 1.1889 + in[14] = _mm_packs_epi32(v[6], v[7]); 1.1890 + 1.1891 + // stage 2 1.1892 + u[0] = _mm_unpacklo_epi16(s[2], s[5]); 1.1893 + u[1] = _mm_unpackhi_epi16(s[2], s[5]); 1.1894 + u[2] = _mm_unpacklo_epi16(s[3], s[4]); 1.1895 + u[3] = _mm_unpackhi_epi16(s[3], s[4]); 1.1896 + 1.1897 + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1.1898 + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1.1899 + v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1.1900 + v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1.1901 + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1.1902 + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 1.1903 + v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1.1904 + v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1.1905 + 1.1906 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.1907 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.1908 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.1909 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.1910 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.1911 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.1912 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.1913 + u[7] = _mm_add_epi32(v[7], 
k__DCT_CONST_ROUNDING); 1.1914 + 1.1915 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1916 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1917 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1918 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1919 + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.1920 + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.1921 + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.1922 + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.1923 + 1.1924 + t[2] = _mm_packs_epi32(v[0], v[1]); 1.1925 + t[3] = _mm_packs_epi32(v[2], v[3]); 1.1926 + t[4] = _mm_packs_epi32(v[4], v[5]); 1.1927 + t[5] = _mm_packs_epi32(v[6], v[7]); 1.1928 + 1.1929 + // stage 3 1.1930 + p[0] = _mm_add_epi16(s[0], t[3]); 1.1931 + p[1] = _mm_add_epi16(s[1], t[2]); 1.1932 + p[2] = _mm_sub_epi16(s[1], t[2]); 1.1933 + p[3] = _mm_sub_epi16(s[0], t[3]); 1.1934 + p[4] = _mm_sub_epi16(s[7], t[4]); 1.1935 + p[5] = _mm_sub_epi16(s[6], t[5]); 1.1936 + p[6] = _mm_add_epi16(s[6], t[5]); 1.1937 + p[7] = _mm_add_epi16(s[7], t[4]); 1.1938 + 1.1939 + // stage 4 1.1940 + u[0] = _mm_unpacklo_epi16(p[1], p[6]); 1.1941 + u[1] = _mm_unpackhi_epi16(p[1], p[6]); 1.1942 + u[2] = _mm_unpacklo_epi16(p[2], p[5]); 1.1943 + u[3] = _mm_unpackhi_epi16(p[2], p[5]); 1.1944 + 1.1945 + v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); 1.1946 + v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); 1.1947 + v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); 1.1948 + v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); 1.1949 + v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); 1.1950 + v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); 1.1951 + v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); 1.1952 + v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); 1.1953 + 1.1954 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.1955 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.1956 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.1957 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.1958 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.1959 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.1960 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.1961 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.1962 + 1.1963 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1964 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1965 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1966 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1967 + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.1968 + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.1969 + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.1970 + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.1971 + 1.1972 + t[1] = _mm_packs_epi32(v[0], v[1]); 1.1973 + t[2] = _mm_packs_epi32(v[2], v[3]); 1.1974 + t[5] = _mm_packs_epi32(v[4], v[5]); 1.1975 + t[6] = _mm_packs_epi32(v[6], v[7]); 1.1976 + 1.1977 + // stage 5 1.1978 + s[0] = _mm_add_epi16(p[0], t[1]); 1.1979 + s[1] = _mm_sub_epi16(p[0], t[1]); 1.1980 + s[2] = _mm_sub_epi16(p[3], t[2]); 1.1981 + s[3] = _mm_add_epi16(p[3], t[2]); 1.1982 + s[4] = _mm_add_epi16(p[4], t[5]); 1.1983 + s[5] = _mm_sub_epi16(p[4], t[5]); 1.1984 + s[6] = _mm_sub_epi16(p[7], t[6]); 1.1985 + s[7] = _mm_add_epi16(p[7], t[6]); 1.1986 + 1.1987 + // stage 6 1.1988 + u[0] = _mm_unpacklo_epi16(s[0], s[7]); 1.1989 + u[1] = _mm_unpackhi_epi16(s[0], s[7]); 1.1990 + u[2] = _mm_unpacklo_epi16(s[1], s[6]); 1.1991 + u[3] = _mm_unpackhi_epi16(s[1], s[6]); 1.1992 + u[4] = _mm_unpacklo_epi16(s[2], s[5]); 1.1993 + u[5] = _mm_unpackhi_epi16(s[2], s[5]); 1.1994 + u[6] = _mm_unpacklo_epi16(s[3], 
s[4]); 1.1995 + u[7] = _mm_unpackhi_epi16(s[3], s[4]); 1.1996 + 1.1997 + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); 1.1998 + v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); 1.1999 + v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); 1.2000 + v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); 1.2001 + v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); 1.2002 + v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); 1.2003 + v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); 1.2004 + v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); 1.2005 + v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); 1.2006 + v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); 1.2007 + v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); 1.2008 + v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); 1.2009 + v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); 1.2010 + v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); 1.2011 + v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); 1.2012 + v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); 1.2013 + 1.2014 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.2015 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.2016 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.2017 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.2018 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.2019 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.2020 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.2021 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.2022 + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1.2023 + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1.2024 + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1.2025 + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1.2026 + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 1.2027 + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1.2028 + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1.2029 + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1.2030 + 1.2031 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.2032 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.2033 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.2034 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.2035 + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.2036 + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.2037 + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.2038 + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.2039 + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1.2040 + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1.2041 + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1.2042 + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1.2043 + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1.2044 + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1.2045 + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1.2046 + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1.2047 + 1.2048 + in[1] = _mm_packs_epi32(v[0], v[1]); 1.2049 + in[9] = _mm_packs_epi32(v[2], v[3]); 1.2050 + in[5] = _mm_packs_epi32(v[4], v[5]); 1.2051 + in[13] = _mm_packs_epi32(v[6], v[7]); 1.2052 + in[3] = _mm_packs_epi32(v[8], v[9]); 1.2053 + in[11] = _mm_packs_epi32(v[10], v[11]); 1.2054 + in[7] = _mm_packs_epi32(v[12], v[13]); 1.2055 + in[15] = _mm_packs_epi32(v[14], v[15]); 1.2056 +} 1.2057 + 1.2058 +void fadst16_1d_8col(__m128i *in) { 1.2059 + // perform 16x16 1-D ADST for 8 columns 1.2060 + __m128i s[16], x[16], u[32], v[32]; 1.2061 + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 1.2062 + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1.2063 + 
const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 1.2064 + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1.2065 + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 1.2066 + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1.2067 + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 1.2068 + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 1.2069 + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 1.2070 + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1.2071 + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 1.2072 + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1.2073 + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 1.2074 + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1.2075 + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 1.2076 + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1.2077 + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1.2078 + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1.2079 + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1.2080 + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1.2081 + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 1.2082 + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 1.2083 + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1.2084 + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1.2085 + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 1.2086 + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 1.2087 + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1.2088 + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1.2089 + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.2090 + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.2091 + const __m128i kZero = _mm_set1_epi16(0); 1.2092 + 1.2093 + u[0] = _mm_unpacklo_epi16(in[15], in[0]); 1.2094 + u[1] = _mm_unpackhi_epi16(in[15], in[0]); 1.2095 + u[2] = _mm_unpacklo_epi16(in[13], in[2]); 1.2096 + u[3] = _mm_unpackhi_epi16(in[13], in[2]); 1.2097 + u[4] = _mm_unpacklo_epi16(in[11], in[4]); 1.2098 + u[5] = _mm_unpackhi_epi16(in[11], in[4]); 1.2099 + u[6] = _mm_unpacklo_epi16(in[9], in[6]); 1.2100 + u[7] = _mm_unpackhi_epi16(in[9], in[6]); 1.2101 + u[8] = _mm_unpacklo_epi16(in[7], in[8]); 1.2102 + u[9] = _mm_unpackhi_epi16(in[7], in[8]); 1.2103 + u[10] = _mm_unpacklo_epi16(in[5], in[10]); 1.2104 + u[11] = _mm_unpackhi_epi16(in[5], in[10]); 1.2105 + u[12] = _mm_unpacklo_epi16(in[3], in[12]); 1.2106 + u[13] = _mm_unpackhi_epi16(in[3], in[12]); 1.2107 + u[14] = _mm_unpacklo_epi16(in[1], in[14]); 1.2108 + u[15] = _mm_unpackhi_epi16(in[1], in[14]); 1.2109 + 1.2110 + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 1.2111 + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 1.2112 + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 1.2113 + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 1.2114 + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 1.2115 + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 1.2116 + v[6] 
= _mm_madd_epi16(u[2], k__cospi_p27_m05); 1.2117 + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 1.2118 + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 1.2119 + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 1.2120 + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 1.2121 + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 1.2122 + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 1.2123 + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 1.2124 + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 1.2125 + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 1.2126 + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 1.2127 + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 1.2128 + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 1.2129 + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 1.2130 + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 1.2131 + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 1.2132 + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 1.2133 + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 1.2134 + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 1.2135 + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 1.2136 + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 1.2137 + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 1.2138 + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 1.2139 + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 1.2140 + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 1.2141 + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 1.2142 + 1.2143 + u[0] = _mm_add_epi32(v[0], v[16]); 1.2144 + u[1] = _mm_add_epi32(v[1], v[17]); 1.2145 + u[2] = _mm_add_epi32(v[2], v[18]); 1.2146 + u[3] = _mm_add_epi32(v[3], v[19]); 1.2147 + u[4] = _mm_add_epi32(v[4], v[20]); 1.2148 + u[5] = _mm_add_epi32(v[5], v[21]); 1.2149 + u[6] = _mm_add_epi32(v[6], v[22]); 1.2150 + u[7] = _mm_add_epi32(v[7], v[23]); 1.2151 + u[8] = _mm_add_epi32(v[8], v[24]); 1.2152 + u[9] = _mm_add_epi32(v[9], v[25]); 1.2153 + u[10] = _mm_add_epi32(v[10], v[26]); 1.2154 + u[11] = _mm_add_epi32(v[11], v[27]); 1.2155 + u[12] = _mm_add_epi32(v[12], v[28]); 1.2156 + u[13] = _mm_add_epi32(v[13], v[29]); 1.2157 + u[14] = _mm_add_epi32(v[14], v[30]); 1.2158 + u[15] = _mm_add_epi32(v[15], v[31]); 1.2159 + u[16] = _mm_sub_epi32(v[0], v[16]); 1.2160 + u[17] = _mm_sub_epi32(v[1], v[17]); 1.2161 + u[18] = _mm_sub_epi32(v[2], v[18]); 1.2162 + u[19] = _mm_sub_epi32(v[3], v[19]); 1.2163 + u[20] = _mm_sub_epi32(v[4], v[20]); 1.2164 + u[21] = _mm_sub_epi32(v[5], v[21]); 1.2165 + u[22] = _mm_sub_epi32(v[6], v[22]); 1.2166 + u[23] = _mm_sub_epi32(v[7], v[23]); 1.2167 + u[24] = _mm_sub_epi32(v[8], v[24]); 1.2168 + u[25] = _mm_sub_epi32(v[9], v[25]); 1.2169 + u[26] = _mm_sub_epi32(v[10], v[26]); 1.2170 + u[27] = _mm_sub_epi32(v[11], v[27]); 1.2171 + u[28] = _mm_sub_epi32(v[12], v[28]); 1.2172 + u[29] = _mm_sub_epi32(v[13], v[29]); 1.2173 + u[30] = _mm_sub_epi32(v[14], v[30]); 1.2174 + u[31] = _mm_sub_epi32(v[15], v[31]); 1.2175 + 1.2176 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.2177 + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.2178 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.2179 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.2180 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.2181 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.2182 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.2183 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.2184 + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1.2185 + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1.2186 + v[10] = 
_mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.2187 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.2188 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.2189 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.2190 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.2191 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.2192 + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 1.2193 + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 1.2194 + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 1.2195 + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 1.2196 + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 1.2197 + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 1.2198 + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 1.2199 + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 1.2200 + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 1.2201 + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 1.2202 + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 1.2203 + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 1.2204 + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 1.2205 + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 1.2206 + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 1.2207 + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 1.2208 + 1.2209 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.2210 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.2211 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.2212 + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.2213 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.2214 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.2215 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.2216 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.2217 + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1.2218 + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1.2219 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.2220 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.2221 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.2222 + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.2223 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.2224 + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.2225 + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 1.2226 + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 1.2227 + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 1.2228 + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 1.2229 + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 1.2230 + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 1.2231 + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 1.2232 + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 1.2233 + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 1.2234 + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 1.2235 + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 1.2236 + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 1.2237 + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 1.2238 + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 1.2239 + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 1.2240 + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 1.2241 + 1.2242 + s[0] = _mm_packs_epi32(u[0], u[1]); 1.2243 + s[1] = _mm_packs_epi32(u[2], u[3]); 1.2244 + s[2] = _mm_packs_epi32(u[4], u[5]); 1.2245 + s[3] = _mm_packs_epi32(u[6], u[7]); 1.2246 + s[4] = _mm_packs_epi32(u[8], u[9]); 1.2247 + s[5] = _mm_packs_epi32(u[10], u[11]); 1.2248 + s[6] = _mm_packs_epi32(u[12], u[13]); 1.2249 + s[7] = _mm_packs_epi32(u[14], u[15]); 1.2250 + s[8] = _mm_packs_epi32(u[16], u[17]); 
1.2251 + s[9] = _mm_packs_epi32(u[18], u[19]); 1.2252 + s[10] = _mm_packs_epi32(u[20], u[21]); 1.2253 + s[11] = _mm_packs_epi32(u[22], u[23]); 1.2254 + s[12] = _mm_packs_epi32(u[24], u[25]); 1.2255 + s[13] = _mm_packs_epi32(u[26], u[27]); 1.2256 + s[14] = _mm_packs_epi32(u[28], u[29]); 1.2257 + s[15] = _mm_packs_epi32(u[30], u[31]); 1.2258 + 1.2259 + // stage 2 1.2260 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); 1.2261 + u[1] = _mm_unpackhi_epi16(s[8], s[9]); 1.2262 + u[2] = _mm_unpacklo_epi16(s[10], s[11]); 1.2263 + u[3] = _mm_unpackhi_epi16(s[10], s[11]); 1.2264 + u[4] = _mm_unpacklo_epi16(s[12], s[13]); 1.2265 + u[5] = _mm_unpackhi_epi16(s[12], s[13]); 1.2266 + u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1.2267 + u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1.2268 + 1.2269 + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 1.2270 + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1.2271 + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1.2272 + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1.2273 + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1.2274 + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1.2275 + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1.2276 + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 1.2277 + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 1.2278 + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 1.2279 + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 1.2280 + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 1.2281 + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 1.2282 + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 1.2283 + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 1.2284 + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 1.2285 + 1.2286 + u[0] = _mm_add_epi32(v[0], v[8]); 1.2287 + u[1] = _mm_add_epi32(v[1], v[9]); 1.2288 + u[2] = _mm_add_epi32(v[2], v[10]); 1.2289 + u[3] = _mm_add_epi32(v[3], v[11]); 1.2290 + u[4] = _mm_add_epi32(v[4], v[12]); 1.2291 + u[5] = _mm_add_epi32(v[5], v[13]); 1.2292 + u[6] = _mm_add_epi32(v[6], v[14]); 1.2293 + u[7] = _mm_add_epi32(v[7], v[15]); 1.2294 + u[8] = _mm_sub_epi32(v[0], v[8]); 1.2295 + u[9] = _mm_sub_epi32(v[1], v[9]); 1.2296 + u[10] = _mm_sub_epi32(v[2], v[10]); 1.2297 + u[11] = _mm_sub_epi32(v[3], v[11]); 1.2298 + u[12] = _mm_sub_epi32(v[4], v[12]); 1.2299 + u[13] = _mm_sub_epi32(v[5], v[13]); 1.2300 + u[14] = _mm_sub_epi32(v[6], v[14]); 1.2301 + u[15] = _mm_sub_epi32(v[7], v[15]); 1.2302 + 1.2303 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.2304 + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.2305 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.2306 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.2307 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.2308 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.2309 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.2310 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.2311 + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1.2312 + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1.2313 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.2314 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.2315 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.2316 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.2317 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.2318 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.2319 + 1.2320 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.2321 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.2322 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.2323 + u[3] = 
_mm_srai_epi32(v[3], DCT_CONST_BITS); 1.2324 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.2325 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.2326 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.2327 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.2328 + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1.2329 + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1.2330 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.2331 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.2332 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.2333 + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.2334 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.2335 + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.2336 + 1.2337 + x[0] = _mm_add_epi16(s[0], s[4]); 1.2338 + x[1] = _mm_add_epi16(s[1], s[5]); 1.2339 + x[2] = _mm_add_epi16(s[2], s[6]); 1.2340 + x[3] = _mm_add_epi16(s[3], s[7]); 1.2341 + x[4] = _mm_sub_epi16(s[0], s[4]); 1.2342 + x[5] = _mm_sub_epi16(s[1], s[5]); 1.2343 + x[6] = _mm_sub_epi16(s[2], s[6]); 1.2344 + x[7] = _mm_sub_epi16(s[3], s[7]); 1.2345 + x[8] = _mm_packs_epi32(u[0], u[1]); 1.2346 + x[9] = _mm_packs_epi32(u[2], u[3]); 1.2347 + x[10] = _mm_packs_epi32(u[4], u[5]); 1.2348 + x[11] = _mm_packs_epi32(u[6], u[7]); 1.2349 + x[12] = _mm_packs_epi32(u[8], u[9]); 1.2350 + x[13] = _mm_packs_epi32(u[10], u[11]); 1.2351 + x[14] = _mm_packs_epi32(u[12], u[13]); 1.2352 + x[15] = _mm_packs_epi32(u[14], u[15]); 1.2353 + 1.2354 + // stage 3 1.2355 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); 1.2356 + u[1] = _mm_unpackhi_epi16(x[4], x[5]); 1.2357 + u[2] = _mm_unpacklo_epi16(x[6], x[7]); 1.2358 + u[3] = _mm_unpackhi_epi16(x[6], x[7]); 1.2359 + u[4] = _mm_unpacklo_epi16(x[12], x[13]); 1.2360 + u[5] = _mm_unpackhi_epi16(x[12], x[13]); 1.2361 + u[6] = _mm_unpacklo_epi16(x[14], x[15]); 1.2362 + u[7] = _mm_unpackhi_epi16(x[14], x[15]); 1.2363 + 1.2364 + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 1.2365 + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 1.2366 + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 1.2367 + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 1.2368 + v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 1.2369 + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 1.2370 + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1.2371 + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1.2372 + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 1.2373 + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 1.2374 + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 1.2375 + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 1.2376 + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 1.2377 + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 1.2378 + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 1.2379 + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 1.2380 + 1.2381 + u[0] = _mm_add_epi32(v[0], v[4]); 1.2382 + u[1] = _mm_add_epi32(v[1], v[5]); 1.2383 + u[2] = _mm_add_epi32(v[2], v[6]); 1.2384 + u[3] = _mm_add_epi32(v[3], v[7]); 1.2385 + u[4] = _mm_sub_epi32(v[0], v[4]); 1.2386 + u[5] = _mm_sub_epi32(v[1], v[5]); 1.2387 + u[6] = _mm_sub_epi32(v[2], v[6]); 1.2388 + u[7] = _mm_sub_epi32(v[3], v[7]); 1.2389 + u[8] = _mm_add_epi32(v[8], v[12]); 1.2390 + u[9] = _mm_add_epi32(v[9], v[13]); 1.2391 + u[10] = _mm_add_epi32(v[10], v[14]); 1.2392 + u[11] = _mm_add_epi32(v[11], v[15]); 1.2393 + u[12] = _mm_sub_epi32(v[8], v[12]); 1.2394 + u[13] = _mm_sub_epi32(v[9], v[13]); 1.2395 + u[14] = _mm_sub_epi32(v[10], v[14]); 1.2396 + u[15] = _mm_sub_epi32(v[11], v[15]); 1.2397 + 1.2398 + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.2399 + u[1] = 
  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
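
  // Final stage: write the results out in the fadst16 output order, with
  // the sign flips the scalar version applies; kZero - x is used to negate
  // because SSE2 has no 16-bit negation instruction.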
  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

void fdct16_1d_sse2(__m128i *in0, __m128i *in1) {
  fdct16_1d_8col(in0);
  fdct16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}

void fadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  fadst16_1d_8col(in0);
  fadst16_1d_8col(in1);
  array_transpose_16x16(in0, in1);
}
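
// Each *_1d_sse2 wrapper runs the 1-D transform on the two 8-column halves
// of the 16x16 block and then transposes, so a second call transforms along
// the other dimension. The hybrid transform below composes two such passes
// according to tx_type, with right_shift_16x16() rescaling the intermediate
// values between them.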
void vp9_short_fht16x16_sse2(const int16_t *input, int16_t *output,
                             int stride, int tx_type) {
  __m128i in0[16], in1[16];
  load_buffer_16x16(input, in0, in1, stride);
  switch (tx_type) {
    case 0:  // DCT_DCT
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fdct16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      fdct16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      fadst16_1d_sse2(in0, in1);
      right_shift_16x16(in0, in1);
      fadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }
  write_buffer_16x16(output, in0, in1, 16);
}

// The 32x32 forward DCT is generated twice from the same source file: the
// included file defines whatever function FDCT32x32_2D names, and
// FDCT32x32_HIGH_PRECISION selects its internal rounding path (0 builds
// the reduced-precision vp9_fdct32x32_rd_sse2 variant, 1 the
// full-precision vp9_fdct32x32_sse2).
#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION

#define FDCT32x32_2D vp9_fdct32x32_sse2
#define FDCT32x32_HIGH_PRECISION 1
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"  // NOLINT
#undef FDCT32x32_2D
#undef FDCT32x32_HIGH_PRECISION