--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,3936 @@
/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

#define RECON_AND_STORE4X4(dest, in_x) \
{ \
  __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
  d0 = _mm_unpacklo_epi8(d0, zero); \
  d0 = _mm_add_epi16(in_x, d0); \
  d0 = _mm_packus_epi16(d0, d0); \
  *(int *)dest = _mm_cvtsi128_si32(d0); \
  dest += stride; \
}

void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3, and then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
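  // 0x4e is _MM_SHUFFLE(1, 0, 3, 2): it swaps the two 64-bit halves of the
  // register, lining the columns up for the add/sub butterfly below.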
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3, and then we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}
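// The *_1_add variants handle blocks in which only the DC coefficient is
// nonzero. The 2-D transform of a lone DC term is a constant, so it is
// computed once in scalar code, broadcast, and added to every pixel.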
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

static void idct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[2]);
  u[1] = _mm_packs_epi32(v[1], v[3]);
  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
  u[3] = _mm_unpackhi_epi64(u[1], u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[3]);
  in[1] = _mm_add_epi16(u[1], u[2]);
  in[2] = _mm_sub_epi16(u[1], u[2]);
  in[3] = _mm_sub_epi16(u[0], u[3]);
}
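// 4-point inverse ADST. The sinpi_k_9 constants are VP9's scaled
// sin(k*pi/9) basis terms; the mixing is done in 32-bit precision and
// rounded back to 16 bits with the same DCT_CONST_ROUNDING/DCT_CONST_BITS
// scheme as the DCT path.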
static void iadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_add_epi16(in[0], in[3]);
  in7 = _mm_sub_epi16(in7, in[2]);

  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[1], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}

void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}
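// The TRANSPOSE_* macros transpose 16-bit elements using rounds of 16-, 32-
// and 64-bit unpack instructions. The 4X8 and 8X4 variants serve the
// partially populated blocks of the reduced paths; note that TRANSPOSE_4X8
// requires a `zero` register in scope for the unused output rows.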
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */ \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */ \
  }

// Define Macro for multiplying elements by constants and adding them together.
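// Each invocation computes four constant-coefficient butterflies: given
// inputs holding interleaved 16-bit pairs (a, b), _mm_madd_epi16 against a
// pair_set_epi16(c0, c1) register yields a * c0 + b * c1 in 32 bits; adding
// DCT_CONST_ROUNDING and shifting right by DCT_CONST_BITS is the vector
// form of dct_const_round_shift().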
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }
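// IDCT8_1D is the 4-stage 8-point inverse DCT butterfly network. It
// transforms in0-in7 in place and expects the stg1_*/stg2_* constant pairs,
// the stp1_*/stp2_* and tmp0-tmp7 temporaries, and `rounding` to be in
// scope at the expansion site.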
#define IDCT8_1D \
  /* Stage1 */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4, \
                           stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4 */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }
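// RECON_AND_STORE adds one 8-wide row of residuals (in_x) to the predictor
// row at `dest`, saturates the sum to 8 bits with _mm_packus_epi16, stores
// it back, and advances `dest` by one stride.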
void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
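// idct8_1d_sse2 and iadst8_1d_sse2 below are the 1-D 8-point kernels that
// vp9_iht8x8_64_add_sse2 combines to implement the hybrid transform types.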
static void idct8_1d_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  in0 = in[0];
  in1 = in[1];
  in2 = in[2];
  in3 = in[3];
  in4 = in[4];
  in5 = in[5];
  in6 = in[6];
  in7 = in[7];

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8_1D
  in[0] = in0;
  in[1] = in1;
  in[2] = in2;
  in[3] = in3;
  in[4] = in4;
  in[5] = in5;
  in[6] = in6;
  in[7] = in7;
}
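// 8-point inverse ADST. Stage 1 mixes the reordered inputs with cospi_k_64
// pairs in 32-bit precision, stage 2 applies 8/24 butterflies to the second
// half, stage 3 applies 16/16 butterflies, and the odd-indexed outputs are
// negated at the end.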
static void iadst8_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}
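// tx_type selects the pair of 1-D transforms: 0 = DCT_DCT, 1 = ADST_DCT,
// 2 = DCT_ADST, 3 = ADST_ADST. Each 1-D kernel transposes its input first,
// so running two kernels back to back covers rows and then columns.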
void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}
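// Reduced path for 8x8 blocks whose nonzero coefficients all sit in the
// low-frequency corner (the _10 suffix refers to at most the first 10
// coefficients in scan order being nonzero), so only four input rows are
// loaded and the row pass works on an 8x4 slice.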
void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}
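// IDCT16_1D covers stages 2-6 of the 16-point inverse DCT over sixteen row
// registers in0-in15; the caller performs the final stage-7 butterflies,
// which differ between the row and column passes.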
#define IDCT16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }
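// The 16x16 inverse transform is processed as two 8x16 half-blocks: loop
// passes 0-1 run the row transform and park the results in l0-l15 (left
// half) and r0-r15 (right half); passes 2-3 transpose those halves back and
// run the column transform plus reconstruction.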
void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest,
                                int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;
  __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
          r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
          r12 = zero, r13 = zero, r14 = zero, r15 = zero;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // We work on an 8x16 block each time, and loop 4 times for the 2-D 16x16
  // idct.
  for (i = 0; i < 4; i++) {
    // 1-D idct
    if (i < 2) {
      if (i == 1) input += 128;

      // Load input data.
      in0 = _mm_load_si128((const __m128i *)input);
      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));

      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
    }

    if (i == 2) {
      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);
      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
                    in13, in14, in15);
    }

    if (i == 3) {
      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
                    in12, in13, in14, in15);
    }

    IDCT16_1D

    // Stage7
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D
      in0 = _mm_add_epi16(stp2_0, stp1_15);
= _mm_add_epi16(stp2_1, stp1_14); 1.1377 + in2 = _mm_add_epi16(stp2_2, stp2_13); 1.1378 + in3 = _mm_add_epi16(stp2_3, stp2_12); 1.1379 + in4 = _mm_add_epi16(stp2_4, stp2_11); 1.1380 + in5 = _mm_add_epi16(stp2_5, stp2_10); 1.1381 + in6 = _mm_add_epi16(stp2_6, stp1_9); 1.1382 + in7 = _mm_add_epi16(stp2_7, stp1_8); 1.1383 + in8 = _mm_sub_epi16(stp2_7, stp1_8); 1.1384 + in9 = _mm_sub_epi16(stp2_6, stp1_9); 1.1385 + in10 = _mm_sub_epi16(stp2_5, stp2_10); 1.1386 + in11 = _mm_sub_epi16(stp2_4, stp2_11); 1.1387 + in12 = _mm_sub_epi16(stp2_3, stp2_12); 1.1388 + in13 = _mm_sub_epi16(stp2_2, stp2_13); 1.1389 + in14 = _mm_sub_epi16(stp2_1, stp1_14); 1.1390 + in15 = _mm_sub_epi16(stp2_0, stp1_15); 1.1391 + 1.1392 + // Final rounding and shift 1.1393 + in0 = _mm_adds_epi16(in0, final_rounding); 1.1394 + in1 = _mm_adds_epi16(in1, final_rounding); 1.1395 + in2 = _mm_adds_epi16(in2, final_rounding); 1.1396 + in3 = _mm_adds_epi16(in3, final_rounding); 1.1397 + in4 = _mm_adds_epi16(in4, final_rounding); 1.1398 + in5 = _mm_adds_epi16(in5, final_rounding); 1.1399 + in6 = _mm_adds_epi16(in6, final_rounding); 1.1400 + in7 = _mm_adds_epi16(in7, final_rounding); 1.1401 + in8 = _mm_adds_epi16(in8, final_rounding); 1.1402 + in9 = _mm_adds_epi16(in9, final_rounding); 1.1403 + in10 = _mm_adds_epi16(in10, final_rounding); 1.1404 + in11 = _mm_adds_epi16(in11, final_rounding); 1.1405 + in12 = _mm_adds_epi16(in12, final_rounding); 1.1406 + in13 = _mm_adds_epi16(in13, final_rounding); 1.1407 + in14 = _mm_adds_epi16(in14, final_rounding); 1.1408 + in15 = _mm_adds_epi16(in15, final_rounding); 1.1409 + 1.1410 + in0 = _mm_srai_epi16(in0, 6); 1.1411 + in1 = _mm_srai_epi16(in1, 6); 1.1412 + in2 = _mm_srai_epi16(in2, 6); 1.1413 + in3 = _mm_srai_epi16(in3, 6); 1.1414 + in4 = _mm_srai_epi16(in4, 6); 1.1415 + in5 = _mm_srai_epi16(in5, 6); 1.1416 + in6 = _mm_srai_epi16(in6, 6); 1.1417 + in7 = _mm_srai_epi16(in7, 6); 1.1418 + in8 = _mm_srai_epi16(in8, 6); 1.1419 + in9 = _mm_srai_epi16(in9, 6); 1.1420 + in10 = _mm_srai_epi16(in10, 6); 1.1421 + in11 = _mm_srai_epi16(in11, 6); 1.1422 + in12 = _mm_srai_epi16(in12, 6); 1.1423 + in13 = _mm_srai_epi16(in13, 6); 1.1424 + in14 = _mm_srai_epi16(in14, 6); 1.1425 + in15 = _mm_srai_epi16(in15, 6); 1.1426 + 1.1427 + RECON_AND_STORE(dest, in0); 1.1428 + RECON_AND_STORE(dest, in1); 1.1429 + RECON_AND_STORE(dest, in2); 1.1430 + RECON_AND_STORE(dest, in3); 1.1431 + RECON_AND_STORE(dest, in4); 1.1432 + RECON_AND_STORE(dest, in5); 1.1433 + RECON_AND_STORE(dest, in6); 1.1434 + RECON_AND_STORE(dest, in7); 1.1435 + RECON_AND_STORE(dest, in8); 1.1436 + RECON_AND_STORE(dest, in9); 1.1437 + RECON_AND_STORE(dest, in10); 1.1438 + RECON_AND_STORE(dest, in11); 1.1439 + RECON_AND_STORE(dest, in12); 1.1440 + RECON_AND_STORE(dest, in13); 1.1441 + RECON_AND_STORE(dest, in14); 1.1442 + RECON_AND_STORE(dest, in15); 1.1443 + 1.1444 + dest += 8 - (stride * 16); 1.1445 + } 1.1446 + } 1.1447 +} 1.1448 + 1.1449 +void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1.1450 + __m128i dc_value; 1.1451 + const __m128i zero = _mm_setzero_si128(); 1.1452 + int a, i; 1.1453 + 1.1454 + a = dct_const_round_shift(input[0] * cospi_16_64); 1.1455 + a = dct_const_round_shift(a * cospi_16_64); 1.1456 + a = ROUND_POWER_OF_TWO(a, 6); 1.1457 + 1.1458 + dc_value = _mm_set1_epi16(a); 1.1459 + 1.1460 + for (i = 0; i < 2; ++i) { 1.1461 + RECON_AND_STORE(dest, dc_value); 1.1462 + RECON_AND_STORE(dest, dc_value); 1.1463 + RECON_AND_STORE(dest, dc_value); 1.1464 + RECON_AND_STORE(dest, dc_value); 1.1465 + 
RECON_AND_STORE(dest, dc_value); 1.1466 + RECON_AND_STORE(dest, dc_value); 1.1467 + RECON_AND_STORE(dest, dc_value); 1.1468 + RECON_AND_STORE(dest, dc_value); 1.1469 + RECON_AND_STORE(dest, dc_value); 1.1470 + RECON_AND_STORE(dest, dc_value); 1.1471 + RECON_AND_STORE(dest, dc_value); 1.1472 + RECON_AND_STORE(dest, dc_value); 1.1473 + RECON_AND_STORE(dest, dc_value); 1.1474 + RECON_AND_STORE(dest, dc_value); 1.1475 + RECON_AND_STORE(dest, dc_value); 1.1476 + RECON_AND_STORE(dest, dc_value); 1.1477 + dest += 8 - (stride * 16); 1.1478 + } 1.1479 +} 1.1480 + 1.1481 +static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { 1.1482 + __m128i tbuf[8]; 1.1483 + array_transpose_8x8(res0, res0); 1.1484 + array_transpose_8x8(res1, tbuf); 1.1485 + array_transpose_8x8(res0 + 8, res1); 1.1486 + array_transpose_8x8(res1 + 8, res1 + 8); 1.1487 + 1.1488 + res0[8] = tbuf[0]; 1.1489 + res0[9] = tbuf[1]; 1.1490 + res0[10] = tbuf[2]; 1.1491 + res0[11] = tbuf[3]; 1.1492 + res0[12] = tbuf[4]; 1.1493 + res0[13] = tbuf[5]; 1.1494 + res0[14] = tbuf[6]; 1.1495 + res0[15] = tbuf[7]; 1.1496 +} 1.1497 + 1.1498 +static void iadst16_1d_8col(__m128i *in) { 1.1499 + // perform 16x16 1-D ADST for 8 columns 1.1500 + __m128i s[16], x[16], u[32], v[32]; 1.1501 + const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); 1.1502 + const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1.1503 + const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); 1.1504 + const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1.1505 + const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); 1.1506 + const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1.1507 + const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); 1.1508 + const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); 1.1509 + const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); 1.1510 + const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1.1511 + const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); 1.1512 + const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1.1513 + const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); 1.1514 + const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1.1515 + const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); 1.1516 + const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1.1517 + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1.1518 + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1.1519 + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1.1520 + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1.1521 + const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); 1.1522 + const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); 1.1523 + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1.1524 + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1.1525 + const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); 1.1526 + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); 1.1527 + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1.1528 + const __m128i k__cospi_p16_m16 = 
pair_set_epi16(cospi_16_64, -cospi_16_64); 1.1529 + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.1530 + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.1531 + const __m128i kZero = _mm_set1_epi16(0); 1.1532 + 1.1533 + u[0] = _mm_unpacklo_epi16(in[15], in[0]); 1.1534 + u[1] = _mm_unpackhi_epi16(in[15], in[0]); 1.1535 + u[2] = _mm_unpacklo_epi16(in[13], in[2]); 1.1536 + u[3] = _mm_unpackhi_epi16(in[13], in[2]); 1.1537 + u[4] = _mm_unpacklo_epi16(in[11], in[4]); 1.1538 + u[5] = _mm_unpackhi_epi16(in[11], in[4]); 1.1539 + u[6] = _mm_unpacklo_epi16(in[9], in[6]); 1.1540 + u[7] = _mm_unpackhi_epi16(in[9], in[6]); 1.1541 + u[8] = _mm_unpacklo_epi16(in[7], in[8]); 1.1542 + u[9] = _mm_unpackhi_epi16(in[7], in[8]); 1.1543 + u[10] = _mm_unpacklo_epi16(in[5], in[10]); 1.1544 + u[11] = _mm_unpackhi_epi16(in[5], in[10]); 1.1545 + u[12] = _mm_unpacklo_epi16(in[3], in[12]); 1.1546 + u[13] = _mm_unpackhi_epi16(in[3], in[12]); 1.1547 + u[14] = _mm_unpacklo_epi16(in[1], in[14]); 1.1548 + u[15] = _mm_unpackhi_epi16(in[1], in[14]); 1.1549 + 1.1550 + v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); 1.1551 + v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); 1.1552 + v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); 1.1553 + v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); 1.1554 + v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); 1.1555 + v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); 1.1556 + v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); 1.1557 + v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); 1.1558 + v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); 1.1559 + v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); 1.1560 + v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); 1.1561 + v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); 1.1562 + v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); 1.1563 + v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); 1.1564 + v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); 1.1565 + v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); 1.1566 + v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); 1.1567 + v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); 1.1568 + v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); 1.1569 + v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); 1.1570 + v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); 1.1571 + v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); 1.1572 + v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); 1.1573 + v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); 1.1574 + v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); 1.1575 + v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); 1.1576 + v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); 1.1577 + v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); 1.1578 + v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); 1.1579 + v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); 1.1580 + v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); 1.1581 + v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); 1.1582 + 1.1583 + u[0] = _mm_add_epi32(v[0], v[16]); 1.1584 + u[1] = _mm_add_epi32(v[1], v[17]); 1.1585 + u[2] = _mm_add_epi32(v[2], v[18]); 1.1586 + u[3] = _mm_add_epi32(v[3], v[19]); 1.1587 + u[4] = _mm_add_epi32(v[4], v[20]); 1.1588 + u[5] = _mm_add_epi32(v[5], v[21]); 1.1589 + u[6] = _mm_add_epi32(v[6], v[22]); 1.1590 + u[7] = _mm_add_epi32(v[7], v[23]); 1.1591 + u[8] = _mm_add_epi32(v[8], v[24]); 1.1592 + u[9] = _mm_add_epi32(v[9], v[25]); 1.1593 + u[10] = _mm_add_epi32(v[10], v[26]); 1.1594 + u[11] = _mm_add_epi32(v[11], v[27]); 1.1595 + u[12] = _mm_add_epi32(v[12], v[28]); 1.1596 + u[13] = _mm_add_epi32(v[13], 
v[29]); 1.1597 + u[14] = _mm_add_epi32(v[14], v[30]); 1.1598 + u[15] = _mm_add_epi32(v[15], v[31]); 1.1599 + u[16] = _mm_sub_epi32(v[0], v[16]); 1.1600 + u[17] = _mm_sub_epi32(v[1], v[17]); 1.1601 + u[18] = _mm_sub_epi32(v[2], v[18]); 1.1602 + u[19] = _mm_sub_epi32(v[3], v[19]); 1.1603 + u[20] = _mm_sub_epi32(v[4], v[20]); 1.1604 + u[21] = _mm_sub_epi32(v[5], v[21]); 1.1605 + u[22] = _mm_sub_epi32(v[6], v[22]); 1.1606 + u[23] = _mm_sub_epi32(v[7], v[23]); 1.1607 + u[24] = _mm_sub_epi32(v[8], v[24]); 1.1608 + u[25] = _mm_sub_epi32(v[9], v[25]); 1.1609 + u[26] = _mm_sub_epi32(v[10], v[26]); 1.1610 + u[27] = _mm_sub_epi32(v[11], v[27]); 1.1611 + u[28] = _mm_sub_epi32(v[12], v[28]); 1.1612 + u[29] = _mm_sub_epi32(v[13], v[29]); 1.1613 + u[30] = _mm_sub_epi32(v[14], v[30]); 1.1614 + u[31] = _mm_sub_epi32(v[15], v[31]); 1.1615 + 1.1616 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1617 + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1618 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1619 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1620 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1621 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1622 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1623 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1624 + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1.1625 + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1.1626 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.1627 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.1628 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.1629 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.1630 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.1631 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.1632 + v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); 1.1633 + v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); 1.1634 + v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); 1.1635 + v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); 1.1636 + v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); 1.1637 + v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); 1.1638 + v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); 1.1639 + v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); 1.1640 + v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); 1.1641 + v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); 1.1642 + v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); 1.1643 + v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); 1.1644 + v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); 1.1645 + v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); 1.1646 + v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); 1.1647 + v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); 1.1648 + 1.1649 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.1650 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.1651 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.1652 + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.1653 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.1654 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.1655 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.1656 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.1657 + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1.1658 + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1.1659 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.1660 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.1661 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.1662 + u[13] = _mm_srai_epi32(v[13], 
DCT_CONST_BITS); 1.1663 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.1664 + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.1665 + u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); 1.1666 + u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); 1.1667 + u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); 1.1668 + u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); 1.1669 + u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); 1.1670 + u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); 1.1671 + u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); 1.1672 + u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); 1.1673 + u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); 1.1674 + u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); 1.1675 + u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); 1.1676 + u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); 1.1677 + u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); 1.1678 + u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); 1.1679 + u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); 1.1680 + u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); 1.1681 + 1.1682 + s[0] = _mm_packs_epi32(u[0], u[1]); 1.1683 + s[1] = _mm_packs_epi32(u[2], u[3]); 1.1684 + s[2] = _mm_packs_epi32(u[4], u[5]); 1.1685 + s[3] = _mm_packs_epi32(u[6], u[7]); 1.1686 + s[4] = _mm_packs_epi32(u[8], u[9]); 1.1687 + s[5] = _mm_packs_epi32(u[10], u[11]); 1.1688 + s[6] = _mm_packs_epi32(u[12], u[13]); 1.1689 + s[7] = _mm_packs_epi32(u[14], u[15]); 1.1690 + s[8] = _mm_packs_epi32(u[16], u[17]); 1.1691 + s[9] = _mm_packs_epi32(u[18], u[19]); 1.1692 + s[10] = _mm_packs_epi32(u[20], u[21]); 1.1693 + s[11] = _mm_packs_epi32(u[22], u[23]); 1.1694 + s[12] = _mm_packs_epi32(u[24], u[25]); 1.1695 + s[13] = _mm_packs_epi32(u[26], u[27]); 1.1696 + s[14] = _mm_packs_epi32(u[28], u[29]); 1.1697 + s[15] = _mm_packs_epi32(u[30], u[31]); 1.1698 + 1.1699 + // stage 2 1.1700 + u[0] = _mm_unpacklo_epi16(s[8], s[9]); 1.1701 + u[1] = _mm_unpackhi_epi16(s[8], s[9]); 1.1702 + u[2] = _mm_unpacklo_epi16(s[10], s[11]); 1.1703 + u[3] = _mm_unpackhi_epi16(s[10], s[11]); 1.1704 + u[4] = _mm_unpacklo_epi16(s[12], s[13]); 1.1705 + u[5] = _mm_unpackhi_epi16(s[12], s[13]); 1.1706 + u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1.1707 + u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1.1708 + 1.1709 + v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); 1.1710 + v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1.1711 + v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1.1712 + v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1.1713 + v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1.1714 + v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1.1715 + v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1.1716 + v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 1.1717 + v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); 1.1718 + v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); 1.1719 + v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); 1.1720 + v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); 1.1721 + v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); 1.1722 + v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); 1.1723 + v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); 1.1724 + v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); 1.1725 + 1.1726 + u[0] = _mm_add_epi32(v[0], v[8]); 1.1727 + u[1] = _mm_add_epi32(v[1], v[9]); 1.1728 + u[2] = _mm_add_epi32(v[2], v[10]); 1.1729 + u[3] = _mm_add_epi32(v[3], v[11]); 1.1730 + u[4] = _mm_add_epi32(v[4], v[12]); 1.1731 + u[5] = _mm_add_epi32(v[5], v[13]); 1.1732 + u[6] = _mm_add_epi32(v[6], v[14]); 1.1733 + u[7] = _mm_add_epi32(v[7], v[15]); 1.1734 + u[8] = _mm_sub_epi32(v[0], v[8]); 1.1735 + u[9] = 
_mm_sub_epi32(v[1], v[9]); 1.1736 + u[10] = _mm_sub_epi32(v[2], v[10]); 1.1737 + u[11] = _mm_sub_epi32(v[3], v[11]); 1.1738 + u[12] = _mm_sub_epi32(v[4], v[12]); 1.1739 + u[13] = _mm_sub_epi32(v[5], v[13]); 1.1740 + u[14] = _mm_sub_epi32(v[6], v[14]); 1.1741 + u[15] = _mm_sub_epi32(v[7], v[15]); 1.1742 + 1.1743 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1744 + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1745 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1746 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1747 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1748 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1749 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1750 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1751 + v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1.1752 + v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1.1753 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.1754 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.1755 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.1756 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.1757 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.1758 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.1759 + 1.1760 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.1761 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.1762 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.1763 + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.1764 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.1765 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.1766 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.1767 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.1768 + u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); 1.1769 + u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); 1.1770 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.1771 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.1772 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.1773 + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.1774 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.1775 + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.1776 + 1.1777 + x[0] = _mm_add_epi16(s[0], s[4]); 1.1778 + x[1] = _mm_add_epi16(s[1], s[5]); 1.1779 + x[2] = _mm_add_epi16(s[2], s[6]); 1.1780 + x[3] = _mm_add_epi16(s[3], s[7]); 1.1781 + x[4] = _mm_sub_epi16(s[0], s[4]); 1.1782 + x[5] = _mm_sub_epi16(s[1], s[5]); 1.1783 + x[6] = _mm_sub_epi16(s[2], s[6]); 1.1784 + x[7] = _mm_sub_epi16(s[3], s[7]); 1.1785 + x[8] = _mm_packs_epi32(u[0], u[1]); 1.1786 + x[9] = _mm_packs_epi32(u[2], u[3]); 1.1787 + x[10] = _mm_packs_epi32(u[4], u[5]); 1.1788 + x[11] = _mm_packs_epi32(u[6], u[7]); 1.1789 + x[12] = _mm_packs_epi32(u[8], u[9]); 1.1790 + x[13] = _mm_packs_epi32(u[10], u[11]); 1.1791 + x[14] = _mm_packs_epi32(u[12], u[13]); 1.1792 + x[15] = _mm_packs_epi32(u[14], u[15]); 1.1793 + 1.1794 + // stage 3 1.1795 + u[0] = _mm_unpacklo_epi16(x[4], x[5]); 1.1796 + u[1] = _mm_unpackhi_epi16(x[4], x[5]); 1.1797 + u[2] = _mm_unpacklo_epi16(x[6], x[7]); 1.1798 + u[3] = _mm_unpackhi_epi16(x[6], x[7]); 1.1799 + u[4] = _mm_unpacklo_epi16(x[12], x[13]); 1.1800 + u[5] = _mm_unpackhi_epi16(x[12], x[13]); 1.1801 + u[6] = _mm_unpacklo_epi16(x[14], x[15]); 1.1802 + u[7] = _mm_unpackhi_epi16(x[14], x[15]); 1.1803 + 1.1804 + v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); 1.1805 + v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); 1.1806 + v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); 1.1807 + v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); 1.1808 + 
v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); 1.1809 + v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); 1.1810 + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1.1811 + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1.1812 + v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); 1.1813 + v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); 1.1814 + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); 1.1815 + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); 1.1816 + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); 1.1817 + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); 1.1818 + v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); 1.1819 + v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); 1.1820 + 1.1821 + u[0] = _mm_add_epi32(v[0], v[4]); 1.1822 + u[1] = _mm_add_epi32(v[1], v[5]); 1.1823 + u[2] = _mm_add_epi32(v[2], v[6]); 1.1824 + u[3] = _mm_add_epi32(v[3], v[7]); 1.1825 + u[4] = _mm_sub_epi32(v[0], v[4]); 1.1826 + u[5] = _mm_sub_epi32(v[1], v[5]); 1.1827 + u[6] = _mm_sub_epi32(v[2], v[6]); 1.1828 + u[7] = _mm_sub_epi32(v[3], v[7]); 1.1829 + u[8] = _mm_add_epi32(v[8], v[12]); 1.1830 + u[9] = _mm_add_epi32(v[9], v[13]); 1.1831 + u[10] = _mm_add_epi32(v[10], v[14]); 1.1832 + u[11] = _mm_add_epi32(v[11], v[15]); 1.1833 + u[12] = _mm_sub_epi32(v[8], v[12]); 1.1834 + u[13] = _mm_sub_epi32(v[9], v[13]); 1.1835 + u[14] = _mm_sub_epi32(v[10], v[14]); 1.1836 + u[15] = _mm_sub_epi32(v[11], v[15]); 1.1837 + 1.1838 + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1839 + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1840 + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1841 + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1842 + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1843 + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1844 + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1845 + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1846 + u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); 1.1847 + u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); 1.1848 + u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.1849 + u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.1850 + u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.1851 + u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.1852 + u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.1853 + u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.1854 + 1.1855 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1856 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1857 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1858 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1859 + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.1860 + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.1861 + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.1862 + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.1863 + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1.1864 + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1.1865 + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1.1866 + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1.1867 + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1.1868 + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1.1869 + v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1.1870 + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1.1871 + 1.1872 + s[0] = _mm_add_epi16(x[0], x[2]); 1.1873 + s[1] = _mm_add_epi16(x[1], x[3]); 1.1874 + s[2] = _mm_sub_epi16(x[0], x[2]); 1.1875 + s[3] = _mm_sub_epi16(x[1], x[3]); 1.1876 + s[4] = _mm_packs_epi32(v[0], v[1]); 1.1877 + s[5] = _mm_packs_epi32(v[2], v[3]); 1.1878 + s[6] = 
_mm_packs_epi32(v[4], v[5]); 1.1879 + s[7] = _mm_packs_epi32(v[6], v[7]); 1.1880 + s[8] = _mm_add_epi16(x[8], x[10]); 1.1881 + s[9] = _mm_add_epi16(x[9], x[11]); 1.1882 + s[10] = _mm_sub_epi16(x[8], x[10]); 1.1883 + s[11] = _mm_sub_epi16(x[9], x[11]); 1.1884 + s[12] = _mm_packs_epi32(v[8], v[9]); 1.1885 + s[13] = _mm_packs_epi32(v[10], v[11]); 1.1886 + s[14] = _mm_packs_epi32(v[12], v[13]); 1.1887 + s[15] = _mm_packs_epi32(v[14], v[15]); 1.1888 + 1.1889 + // stage 4 1.1890 + u[0] = _mm_unpacklo_epi16(s[2], s[3]); 1.1891 + u[1] = _mm_unpackhi_epi16(s[2], s[3]); 1.1892 + u[2] = _mm_unpacklo_epi16(s[6], s[7]); 1.1893 + u[3] = _mm_unpackhi_epi16(s[6], s[7]); 1.1894 + u[4] = _mm_unpacklo_epi16(s[10], s[11]); 1.1895 + u[5] = _mm_unpackhi_epi16(s[10], s[11]); 1.1896 + u[6] = _mm_unpacklo_epi16(s[14], s[15]); 1.1897 + u[7] = _mm_unpackhi_epi16(s[14], s[15]); 1.1898 + 1.1899 + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); 1.1900 + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); 1.1901 + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 1.1902 + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 1.1903 + v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1.1904 + v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 1.1905 + v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1.1906 + v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1.1907 + v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); 1.1908 + v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); 1.1909 + v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); 1.1910 + v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); 1.1911 + v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); 1.1912 + v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); 1.1913 + v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); 1.1914 + v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); 1.1915 + 1.1916 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.1917 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.1918 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.1919 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.1920 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.1921 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.1922 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.1923 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.1924 + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1.1925 + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1.1926 + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1.1927 + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1.1928 + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 1.1929 + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1.1930 + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1.1931 + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1.1932 + 1.1933 + v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1934 + v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1935 + v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1936 + v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1937 + v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.1938 + v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.1939 + v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.1940 + v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.1941 + v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1.1942 + v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1.1943 + v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1.1944 + v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1.1945 + v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1.1946 + v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1.1947 + v[14] = 
_mm_srai_epi32(u[14], DCT_CONST_BITS); 1.1948 + v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1.1949 + 1.1950 + in[0] = s[0]; 1.1951 + in[1] = _mm_sub_epi16(kZero, s[8]); 1.1952 + in[2] = s[12]; 1.1953 + in[3] = _mm_sub_epi16(kZero, s[4]); 1.1954 + in[4] = _mm_packs_epi32(v[4], v[5]); 1.1955 + in[5] = _mm_packs_epi32(v[12], v[13]); 1.1956 + in[6] = _mm_packs_epi32(v[8], v[9]); 1.1957 + in[7] = _mm_packs_epi32(v[0], v[1]); 1.1958 + in[8] = _mm_packs_epi32(v[2], v[3]); 1.1959 + in[9] = _mm_packs_epi32(v[10], v[11]); 1.1960 + in[10] = _mm_packs_epi32(v[14], v[15]); 1.1961 + in[11] = _mm_packs_epi32(v[6], v[7]); 1.1962 + in[12] = s[5]; 1.1963 + in[13] = _mm_sub_epi16(kZero, s[13]); 1.1964 + in[14] = s[9]; 1.1965 + in[15] = _mm_sub_epi16(kZero, s[1]); 1.1966 +} 1.1967 + 1.1968 +static void idct16_1d_8col(__m128i *in) { 1.1969 + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1.1970 + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); 1.1971 + const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1.1972 + const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); 1.1973 + const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1.1974 + const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); 1.1975 + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1.1976 + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); 1.1977 + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1.1978 + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); 1.1979 + const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1.1980 + const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); 1.1981 + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); 1.1982 + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1.1983 + const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1.1984 + const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); 1.1985 + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.1986 + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); 1.1987 + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1.1988 + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.1989 + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.1990 + __m128i v[16], u[16], s[16], t[16]; 1.1991 + 1.1992 + // stage 1 1.1993 + s[0] = in[0]; 1.1994 + s[1] = in[8]; 1.1995 + s[2] = in[4]; 1.1996 + s[3] = in[12]; 1.1997 + s[4] = in[2]; 1.1998 + s[5] = in[10]; 1.1999 + s[6] = in[6]; 1.2000 + s[7] = in[14]; 1.2001 + s[8] = in[1]; 1.2002 + s[9] = in[9]; 1.2003 + s[10] = in[5]; 1.2004 + s[11] = in[13]; 1.2005 + s[12] = in[3]; 1.2006 + s[13] = in[11]; 1.2007 + s[14] = in[7]; 1.2008 + s[15] = in[15]; 1.2009 + 1.2010 + // stage 2 1.2011 + u[0] = _mm_unpacklo_epi16(s[8], s[15]); 1.2012 + u[1] = _mm_unpackhi_epi16(s[8], s[15]); 1.2013 + u[2] = _mm_unpacklo_epi16(s[9], s[14]); 1.2014 + u[3] = _mm_unpackhi_epi16(s[9], s[14]); 1.2015 + u[4] = _mm_unpacklo_epi16(s[10], s[13]); 1.2016 + u[5] = _mm_unpackhi_epi16(s[10], s[13]); 1.2017 + u[6] = _mm_unpacklo_epi16(s[11], s[12]); 1.2018 + u[7] = _mm_unpackhi_epi16(s[11], s[12]); 1.2019 + 1.2020 + v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); 1.2021 + v[1] = 
_mm_madd_epi16(u[1], k__cospi_p30_m02); 1.2022 + v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); 1.2023 + v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); 1.2024 + v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); 1.2025 + v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); 1.2026 + v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); 1.2027 + v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); 1.2028 + v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); 1.2029 + v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); 1.2030 + v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); 1.2031 + v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); 1.2032 + v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); 1.2033 + v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); 1.2034 + v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); 1.2035 + v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); 1.2036 + 1.2037 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.2038 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.2039 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.2040 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.2041 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.2042 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.2043 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.2044 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.2045 + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1.2046 + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1.2047 + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1.2048 + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1.2049 + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 1.2050 + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1.2051 + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1.2052 + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1.2053 + 1.2054 + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.2055 + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.2056 + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.2057 + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.2058 + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.2059 + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.2060 + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.2061 + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.2062 + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1.2063 + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1.2064 + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1.2065 + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1.2066 + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1.2067 + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1.2068 + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1.2069 + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1.2070 + 1.2071 + s[8] = _mm_packs_epi32(u[0], u[1]); 1.2072 + s[15] = _mm_packs_epi32(u[2], u[3]); 1.2073 + s[9] = _mm_packs_epi32(u[4], u[5]); 1.2074 + s[14] = _mm_packs_epi32(u[6], u[7]); 1.2075 + s[10] = _mm_packs_epi32(u[8], u[9]); 1.2076 + s[13] = _mm_packs_epi32(u[10], u[11]); 1.2077 + s[11] = _mm_packs_epi32(u[12], u[13]); 1.2078 + s[12] = _mm_packs_epi32(u[14], u[15]); 1.2079 + 1.2080 + // stage 3 1.2081 + t[0] = s[0]; 1.2082 + t[1] = s[1]; 1.2083 + t[2] = s[2]; 1.2084 + t[3] = s[3]; 1.2085 + u[0] = _mm_unpacklo_epi16(s[4], s[7]); 1.2086 + u[1] = _mm_unpackhi_epi16(s[4], s[7]); 1.2087 + u[2] = _mm_unpacklo_epi16(s[5], s[6]); 1.2088 + u[3] = _mm_unpackhi_epi16(s[5], s[6]); 1.2089 + 1.2090 + v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); 1.2091 + v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); 1.2092 + v[2] = _mm_madd_epi16(u[0], 
k__cospi_p04_p28); 1.2093 + v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); 1.2094 + v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); 1.2095 + v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); 1.2096 + v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); 1.2097 + v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); 1.2098 + 1.2099 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.2100 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.2101 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.2102 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.2103 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.2104 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.2105 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.2106 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.2107 + 1.2108 + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.2109 + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.2110 + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.2111 + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.2112 + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.2113 + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.2114 + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.2115 + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.2116 + 1.2117 + t[4] = _mm_packs_epi32(u[0], u[1]); 1.2118 + t[7] = _mm_packs_epi32(u[2], u[3]); 1.2119 + t[5] = _mm_packs_epi32(u[4], u[5]); 1.2120 + t[6] = _mm_packs_epi32(u[6], u[7]); 1.2121 + t[8] = _mm_add_epi16(s[8], s[9]); 1.2122 + t[9] = _mm_sub_epi16(s[8], s[9]); 1.2123 + t[10] = _mm_sub_epi16(s[11], s[10]); 1.2124 + t[11] = _mm_add_epi16(s[10], s[11]); 1.2125 + t[12] = _mm_add_epi16(s[12], s[13]); 1.2126 + t[13] = _mm_sub_epi16(s[12], s[13]); 1.2127 + t[14] = _mm_sub_epi16(s[15], s[14]); 1.2128 + t[15] = _mm_add_epi16(s[14], s[15]); 1.2129 + 1.2130 + // stage 4 1.2131 + u[0] = _mm_unpacklo_epi16(t[0], t[1]); 1.2132 + u[1] = _mm_unpackhi_epi16(t[0], t[1]); 1.2133 + u[2] = _mm_unpacklo_epi16(t[2], t[3]); 1.2134 + u[3] = _mm_unpackhi_epi16(t[2], t[3]); 1.2135 + u[4] = _mm_unpacklo_epi16(t[9], t[14]); 1.2136 + u[5] = _mm_unpackhi_epi16(t[9], t[14]); 1.2137 + u[6] = _mm_unpacklo_epi16(t[10], t[13]); 1.2138 + u[7] = _mm_unpackhi_epi16(t[10], t[13]); 1.2139 + 1.2140 + v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1.2141 + v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1.2142 + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); 1.2143 + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); 1.2144 + v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); 1.2145 + v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); 1.2146 + v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); 1.2147 + v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); 1.2148 + v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); 1.2149 + v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); 1.2150 + v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); 1.2151 + v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); 1.2152 + v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); 1.2153 + v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); 1.2154 + v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); 1.2155 + v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); 1.2156 + 1.2157 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.2158 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.2159 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.2160 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.2161 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.2162 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.2163 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.2164 + u[7] = 
_mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.2165 + u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); 1.2166 + u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); 1.2167 + u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); 1.2168 + u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); 1.2169 + u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); 1.2170 + u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); 1.2171 + u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); 1.2172 + u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); 1.2173 + 1.2174 + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.2175 + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.2176 + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.2177 + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.2178 + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.2179 + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.2180 + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.2181 + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.2182 + u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); 1.2183 + u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); 1.2184 + u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); 1.2185 + u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); 1.2186 + u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); 1.2187 + u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); 1.2188 + u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); 1.2189 + u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); 1.2190 + 1.2191 + s[0] = _mm_packs_epi32(u[0], u[1]); 1.2192 + s[1] = _mm_packs_epi32(u[2], u[3]); 1.2193 + s[2] = _mm_packs_epi32(u[4], u[5]); 1.2194 + s[3] = _mm_packs_epi32(u[6], u[7]); 1.2195 + s[4] = _mm_add_epi16(t[4], t[5]); 1.2196 + s[5] = _mm_sub_epi16(t[4], t[5]); 1.2197 + s[6] = _mm_sub_epi16(t[7], t[6]); 1.2198 + s[7] = _mm_add_epi16(t[6], t[7]); 1.2199 + s[8] = t[8]; 1.2200 + s[15] = t[15]; 1.2201 + s[9] = _mm_packs_epi32(u[8], u[9]); 1.2202 + s[14] = _mm_packs_epi32(u[10], u[11]); 1.2203 + s[10] = _mm_packs_epi32(u[12], u[13]); 1.2204 + s[13] = _mm_packs_epi32(u[14], u[15]); 1.2205 + s[11] = t[11]; 1.2206 + s[12] = t[12]; 1.2207 + 1.2208 + // stage 5 1.2209 + t[0] = _mm_add_epi16(s[0], s[3]); 1.2210 + t[1] = _mm_add_epi16(s[1], s[2]); 1.2211 + t[2] = _mm_sub_epi16(s[1], s[2]); 1.2212 + t[3] = _mm_sub_epi16(s[0], s[3]); 1.2213 + t[4] = s[4]; 1.2214 + t[7] = s[7]; 1.2215 + 1.2216 + u[0] = _mm_unpacklo_epi16(s[5], s[6]); 1.2217 + u[1] = _mm_unpackhi_epi16(s[5], s[6]); 1.2218 + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1.2219 + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1.2220 + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1.2221 + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1.2222 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.2223 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.2224 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.2225 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.2226 + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.2227 + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.2228 + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.2229 + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.2230 + t[5] = _mm_packs_epi32(u[0], u[1]); 1.2231 + t[6] = _mm_packs_epi32(u[2], u[3]); 1.2232 + 1.2233 + t[8] = _mm_add_epi16(s[8], s[11]); 1.2234 + t[9] = _mm_add_epi16(s[9], s[10]); 1.2235 + t[10] = _mm_sub_epi16(s[9], s[10]); 1.2236 + t[11] = _mm_sub_epi16(s[8], s[11]); 1.2237 + t[12] = _mm_sub_epi16(s[15], s[12]); 1.2238 + t[13] = _mm_sub_epi16(s[14], s[13]); 1.2239 + t[14] = _mm_add_epi16(s[13], s[14]); 1.2240 + t[15] = _mm_add_epi16(s[12], 
s[15]); 1.2241 + 1.2242 + // stage 6 1.2243 + s[0] = _mm_add_epi16(t[0], t[7]); 1.2244 + s[1] = _mm_add_epi16(t[1], t[6]); 1.2245 + s[2] = _mm_add_epi16(t[2], t[5]); 1.2246 + s[3] = _mm_add_epi16(t[3], t[4]); 1.2247 + s[4] = _mm_sub_epi16(t[3], t[4]); 1.2248 + s[5] = _mm_sub_epi16(t[2], t[5]); 1.2249 + s[6] = _mm_sub_epi16(t[1], t[6]); 1.2250 + s[7] = _mm_sub_epi16(t[0], t[7]); 1.2251 + s[8] = t[8]; 1.2252 + s[9] = t[9]; 1.2253 + 1.2254 + u[0] = _mm_unpacklo_epi16(t[10], t[13]); 1.2255 + u[1] = _mm_unpackhi_epi16(t[10], t[13]); 1.2256 + u[2] = _mm_unpacklo_epi16(t[11], t[12]); 1.2257 + u[3] = _mm_unpackhi_epi16(t[11], t[12]); 1.2258 + 1.2259 + v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); 1.2260 + v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); 1.2261 + v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); 1.2262 + v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); 1.2263 + v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); 1.2264 + v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); 1.2265 + v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); 1.2266 + v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); 1.2267 + 1.2268 + u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); 1.2269 + u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); 1.2270 + u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); 1.2271 + u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); 1.2272 + u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); 1.2273 + u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); 1.2274 + u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); 1.2275 + u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); 1.2276 + 1.2277 + u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.2278 + u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.2279 + u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.2280 + u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.2281 + u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.2282 + u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.2283 + u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.2284 + u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.2285 + 1.2286 + s[10] = _mm_packs_epi32(u[0], u[1]); 1.2287 + s[13] = _mm_packs_epi32(u[2], u[3]); 1.2288 + s[11] = _mm_packs_epi32(u[4], u[5]); 1.2289 + s[12] = _mm_packs_epi32(u[6], u[7]); 1.2290 + s[14] = t[14]; 1.2291 + s[15] = t[15]; 1.2292 + 1.2293 + // stage 7 1.2294 + in[0] = _mm_add_epi16(s[0], s[15]); 1.2295 + in[1] = _mm_add_epi16(s[1], s[14]); 1.2296 + in[2] = _mm_add_epi16(s[2], s[13]); 1.2297 + in[3] = _mm_add_epi16(s[3], s[12]); 1.2298 + in[4] = _mm_add_epi16(s[4], s[11]); 1.2299 + in[5] = _mm_add_epi16(s[5], s[10]); 1.2300 + in[6] = _mm_add_epi16(s[6], s[9]); 1.2301 + in[7] = _mm_add_epi16(s[7], s[8]); 1.2302 + in[8] = _mm_sub_epi16(s[7], s[8]); 1.2303 + in[9] = _mm_sub_epi16(s[6], s[9]); 1.2304 + in[10] = _mm_sub_epi16(s[5], s[10]); 1.2305 + in[11] = _mm_sub_epi16(s[4], s[11]); 1.2306 + in[12] = _mm_sub_epi16(s[3], s[12]); 1.2307 + in[13] = _mm_sub_epi16(s[2], s[13]); 1.2308 + in[14] = _mm_sub_epi16(s[1], s[14]); 1.2309 + in[15] = _mm_sub_epi16(s[0], s[15]); 1.2310 +} 1.2311 + 1.2312 +static void idct16_1d_sse2(__m128i *in0, __m128i *in1) { 1.2313 + array_transpose_16x16(in0, in1); 1.2314 + idct16_1d_8col(in0); 1.2315 + idct16_1d_8col(in1); 1.2316 +} 1.2317 + 1.2318 +static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) { 1.2319 + array_transpose_16x16(in0, in1); 1.2320 + iadst16_1d_8col(in0); 1.2321 + iadst16_1d_8col(in1); 1.2322 +} 1.2323 + 1.2324 +static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) { 1.2325 + in[0] = _mm_load_si128((const __m128i *)(input + 0 * 
16)); 1.2326 + in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16)); 1.2327 + in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16)); 1.2328 + in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16)); 1.2329 + in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16)); 1.2330 + in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16)); 1.2331 + in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16)); 1.2332 + in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16)); 1.2333 + 1.2334 + in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16)); 1.2335 + in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16)); 1.2336 + in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16)); 1.2337 + in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16)); 1.2338 + in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16)); 1.2339 + in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16)); 1.2340 + in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16)); 1.2341 + in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16)); 1.2342 +} 1.2343 + 1.2344 +static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { 1.2345 + const __m128i final_rounding = _mm_set1_epi16(1<<5); 1.2346 + const __m128i zero = _mm_setzero_si128(); 1.2347 + // Final rounding and shift 1.2348 + in[0] = _mm_adds_epi16(in[0], final_rounding); 1.2349 + in[1] = _mm_adds_epi16(in[1], final_rounding); 1.2350 + in[2] = _mm_adds_epi16(in[2], final_rounding); 1.2351 + in[3] = _mm_adds_epi16(in[3], final_rounding); 1.2352 + in[4] = _mm_adds_epi16(in[4], final_rounding); 1.2353 + in[5] = _mm_adds_epi16(in[5], final_rounding); 1.2354 + in[6] = _mm_adds_epi16(in[6], final_rounding); 1.2355 + in[7] = _mm_adds_epi16(in[7], final_rounding); 1.2356 + in[8] = _mm_adds_epi16(in[8], final_rounding); 1.2357 + in[9] = _mm_adds_epi16(in[9], final_rounding); 1.2358 + in[10] = _mm_adds_epi16(in[10], final_rounding); 1.2359 + in[11] = _mm_adds_epi16(in[11], final_rounding); 1.2360 + in[12] = _mm_adds_epi16(in[12], final_rounding); 1.2361 + in[13] = _mm_adds_epi16(in[13], final_rounding); 1.2362 + in[14] = _mm_adds_epi16(in[14], final_rounding); 1.2363 + in[15] = _mm_adds_epi16(in[15], final_rounding); 1.2364 + 1.2365 + in[0] = _mm_srai_epi16(in[0], 6); 1.2366 + in[1] = _mm_srai_epi16(in[1], 6); 1.2367 + in[2] = _mm_srai_epi16(in[2], 6); 1.2368 + in[3] = _mm_srai_epi16(in[3], 6); 1.2369 + in[4] = _mm_srai_epi16(in[4], 6); 1.2370 + in[5] = _mm_srai_epi16(in[5], 6); 1.2371 + in[6] = _mm_srai_epi16(in[6], 6); 1.2372 + in[7] = _mm_srai_epi16(in[7], 6); 1.2373 + in[8] = _mm_srai_epi16(in[8], 6); 1.2374 + in[9] = _mm_srai_epi16(in[9], 6); 1.2375 + in[10] = _mm_srai_epi16(in[10], 6); 1.2376 + in[11] = _mm_srai_epi16(in[11], 6); 1.2377 + in[12] = _mm_srai_epi16(in[12], 6); 1.2378 + in[13] = _mm_srai_epi16(in[13], 6); 1.2379 + in[14] = _mm_srai_epi16(in[14], 6); 1.2380 + in[15] = _mm_srai_epi16(in[15], 6); 1.2381 + 1.2382 + RECON_AND_STORE(dest, in[0]); 1.2383 + RECON_AND_STORE(dest, in[1]); 1.2384 + RECON_AND_STORE(dest, in[2]); 1.2385 + RECON_AND_STORE(dest, in[3]); 1.2386 + RECON_AND_STORE(dest, in[4]); 1.2387 + RECON_AND_STORE(dest, in[5]); 1.2388 + RECON_AND_STORE(dest, in[6]); 1.2389 + RECON_AND_STORE(dest, in[7]); 1.2390 + RECON_AND_STORE(dest, in[8]); 1.2391 + RECON_AND_STORE(dest, in[9]); 1.2392 + RECON_AND_STORE(dest, in[10]); 1.2393 + RECON_AND_STORE(dest, in[11]); 1.2394 + RECON_AND_STORE(dest, in[12]); 1.2395 + RECON_AND_STORE(dest, in[13]); 1.2396 + RECON_AND_STORE(dest, in[14]); 1.2397 + RECON_AND_STORE(dest, 
in[15]); 1.2398 +} 1.2399 + 1.2400 +void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride, 1.2401 + int tx_type) { 1.2402 + __m128i in0[16], in1[16]; 1.2403 + 1.2404 + load_buffer_8x16(input, in0); 1.2405 + input += 8; 1.2406 + load_buffer_8x16(input, in1); 1.2407 + 1.2408 + switch (tx_type) { 1.2409 + case 0: // DCT_DCT 1.2410 + idct16_1d_sse2(in0, in1); 1.2411 + idct16_1d_sse2(in0, in1); 1.2412 + break; 1.2413 + case 1: // ADST_DCT 1.2414 + idct16_1d_sse2(in0, in1); 1.2415 + iadst16_1d_sse2(in0, in1); 1.2416 + break; 1.2417 + case 2: // DCT_ADST 1.2418 + iadst16_1d_sse2(in0, in1); 1.2419 + idct16_1d_sse2(in0, in1); 1.2420 + break; 1.2421 + case 3: // ADST_ADST 1.2422 + iadst16_1d_sse2(in0, in1); 1.2423 + iadst16_1d_sse2(in0, in1); 1.2424 + break; 1.2425 + default: 1.2426 + assert(0); 1.2427 + break; 1.2428 + } 1.2429 + 1.2430 + write_buffer_8x16(dest, in0, stride); 1.2431 + dest += 8; 1.2432 + write_buffer_8x16(dest, in1, stride); 1.2433 +} 1.2434 + 1.2435 +void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest, 1.2436 + int stride) { 1.2437 + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.2438 + const __m128i final_rounding = _mm_set1_epi16(1<<5); 1.2439 + const __m128i zero = _mm_setzero_si128(); 1.2440 + 1.2441 + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1.2442 + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1.2443 + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1.2444 + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1.2445 + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1.2446 + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1.2447 + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1.2448 + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1.2449 + 1.2450 + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1.2451 + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1.2452 + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1.2453 + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1.2454 + 1.2455 + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1.2456 + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1.2457 + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1.2458 + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1.2459 + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.2460 + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1.2461 + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1.2462 + const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.2463 + 1.2464 + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.2465 + 1.2466 + __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, 1.2467 + in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, 1.2468 + in10 = zero, in11 = zero, in12 = zero, in13 = zero, 1.2469 + in14 = zero, in15 = zero; 1.2470 + __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, 1.2471 + l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, 1.2472 + l12 = zero, l13 = zero, l14 = zero, l15 = zero; 1.2473 + 1.2474 + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1.2475 + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1.2476 + stp1_8_0, stp1_12_0; 1.2477 + __m128i 
stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1.2478 + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; 1.2479 + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1.2480 + int i; 1.2481 + // 1-D idct. Load input data. 1.2482 + in0 = _mm_load_si128((const __m128i *)input); 1.2483 + in8 = _mm_load_si128((const __m128i *)(input + 8 * 1)); 1.2484 + in1 = _mm_load_si128((const __m128i *)(input + 8 * 2)); 1.2485 + in9 = _mm_load_si128((const __m128i *)(input + 8 * 3)); 1.2486 + in2 = _mm_load_si128((const __m128i *)(input + 8 * 4)); 1.2487 + in10 = _mm_load_si128((const __m128i *)(input + 8 * 5)); 1.2488 + in3 = _mm_load_si128((const __m128i *)(input + 8 * 6)); 1.2489 + in11 = _mm_load_si128((const __m128i *)(input + 8 * 7)); 1.2490 + 1.2491 + TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3); 1.2492 + TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11); 1.2493 + 1.2494 + // Stage2 1.2495 + { 1.2496 + const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11); 1.2497 + const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3); 1.2498 + const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9); 1.2499 + const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1); 1.2500 + 1.2501 + tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); 1.2502 + tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); 1.2503 + tmp4 = _mm_madd_epi16(lo_9_7, stg2_2); 1.2504 + tmp6 = _mm_madd_epi16(lo_9_7, stg2_3); 1.2505 + tmp1 = _mm_madd_epi16(lo_5_11, stg2_4); 1.2506 + tmp3 = _mm_madd_epi16(lo_5_11, stg2_5); 1.2507 + tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); 1.2508 + tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); 1.2509 + 1.2510 + tmp0 = _mm_add_epi32(tmp0, rounding); 1.2511 + tmp2 = _mm_add_epi32(tmp2, rounding); 1.2512 + tmp4 = _mm_add_epi32(tmp4, rounding); 1.2513 + tmp6 = _mm_add_epi32(tmp6, rounding); 1.2514 + tmp1 = _mm_add_epi32(tmp1, rounding); 1.2515 + tmp3 = _mm_add_epi32(tmp3, rounding); 1.2516 + tmp5 = _mm_add_epi32(tmp5, rounding); 1.2517 + tmp7 = _mm_add_epi32(tmp7, rounding); 1.2518 + 1.2519 + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1.2520 + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1.2521 + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1.2522 + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1.2523 + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 1.2524 + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 1.2525 + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 1.2526 + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 1.2527 + 1.2528 + stp2_8 = _mm_packs_epi32(tmp0, zero); 1.2529 + stp2_15 = _mm_packs_epi32(tmp2, zero); 1.2530 + stp2_9 = _mm_packs_epi32(tmp4, zero); 1.2531 + stp2_14 = _mm_packs_epi32(tmp6, zero); 1.2532 + 1.2533 + stp2_10 = _mm_packs_epi32(tmp1, zero); 1.2534 + stp2_13 = _mm_packs_epi32(tmp3, zero); 1.2535 + stp2_11 = _mm_packs_epi32(tmp5, zero); 1.2536 + stp2_12 = _mm_packs_epi32(tmp7, zero); 1.2537 + } 1.2538 + 1.2539 + // Stage3 1.2540 + { 1.2541 + const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11); 1.2542 + const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3); 1.2543 + 1.2544 + tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); 1.2545 + tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); 1.2546 + tmp4 = _mm_madd_epi16(lo_10_6, stg3_2); 1.2547 + tmp6 = _mm_madd_epi16(lo_10_6, stg3_3); 1.2548 + 1.2549 + tmp0 = _mm_add_epi32(tmp0, rounding); 1.2550 + tmp2 = _mm_add_epi32(tmp2, rounding); 1.2551 + tmp4 = _mm_add_epi32(tmp4, rounding); 1.2552 + tmp6 = _mm_add_epi32(tmp6, rounding); 1.2553 + 1.2554 + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1.2555 + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1.2556 + tmp4 = 
_mm_srai_epi32(tmp4, DCT_CONST_BITS); 1.2557 + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1.2558 + 1.2559 + stp1_4 = _mm_packs_epi32(tmp0, zero); 1.2560 + stp1_7 = _mm_packs_epi32(tmp2, zero); 1.2561 + stp1_5 = _mm_packs_epi32(tmp4, zero); 1.2562 + stp1_6 = _mm_packs_epi32(tmp6, zero); 1.2563 + 1.2564 + stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); 1.2565 + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); 1.2566 + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); 1.2567 + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); 1.2568 + 1.2569 + stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); 1.2570 + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); 1.2571 + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); 1.2572 + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); 1.2573 + } 1.2574 + 1.2575 + // Stage4 1.2576 + { 1.2577 + const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); 1.2578 + const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10); 1.2579 + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); 1.2580 + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 1.2581 + 1.2582 + tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); 1.2583 + tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); 1.2584 + tmp4 = _mm_madd_epi16(lo_4_12, stg4_2); 1.2585 + tmp6 = _mm_madd_epi16(lo_4_12, stg4_3); 1.2586 + tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); 1.2587 + tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); 1.2588 + tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); 1.2589 + tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); 1.2590 + 1.2591 + tmp0 = _mm_add_epi32(tmp0, rounding); 1.2592 + tmp2 = _mm_add_epi32(tmp2, rounding); 1.2593 + tmp4 = _mm_add_epi32(tmp4, rounding); 1.2594 + tmp6 = _mm_add_epi32(tmp6, rounding); 1.2595 + tmp1 = _mm_add_epi32(tmp1, rounding); 1.2596 + tmp3 = _mm_add_epi32(tmp3, rounding); 1.2597 + tmp5 = _mm_add_epi32(tmp5, rounding); 1.2598 + tmp7 = _mm_add_epi32(tmp7, rounding); 1.2599 + 1.2600 + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1.2601 + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1.2602 + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1.2603 + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1.2604 + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 1.2605 + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 1.2606 + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); 1.2607 + tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); 1.2608 + 1.2609 + stp2_0 = _mm_packs_epi32(tmp0, zero); 1.2610 + stp2_1 = _mm_packs_epi32(tmp2, zero); 1.2611 + stp2_2 = _mm_packs_epi32(tmp4, zero); 1.2612 + stp2_3 = _mm_packs_epi32(tmp6, zero); 1.2613 + stp2_9 = _mm_packs_epi32(tmp1, zero); 1.2614 + stp2_14 = _mm_packs_epi32(tmp3, zero); 1.2615 + stp2_10 = _mm_packs_epi32(tmp5, zero); 1.2616 + stp2_13 = _mm_packs_epi32(tmp7, zero); 1.2617 + 1.2618 + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); 1.2619 + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); 1.2620 + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); 1.2621 + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); 1.2622 + } 1.2623 + 1.2624 + // Stage5 and Stage6 1.2625 + { 1.2626 + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); 1.2627 + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); 1.2628 + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); 1.2629 + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); 1.2630 + 1.2631 + stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); 1.2632 + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); 1.2633 + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); 1.2634 + stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); 1.2635 + 1.2636 + stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); 1.2637 + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); 1.2638 + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); 1.2639 + stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); 
1.2640 + } 1.2641 + 1.2642 + // Stage6 1.2643 + { 1.2644 + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); 1.2645 + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); 1.2646 + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); 1.2647 + 1.2648 + tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); 1.2649 + tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); 1.2650 + tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); 1.2651 + tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); 1.2652 + tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); 1.2653 + tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); 1.2654 + 1.2655 + tmp1 = _mm_add_epi32(tmp1, rounding); 1.2656 + tmp3 = _mm_add_epi32(tmp3, rounding); 1.2657 + tmp0 = _mm_add_epi32(tmp0, rounding); 1.2658 + tmp2 = _mm_add_epi32(tmp2, rounding); 1.2659 + tmp4 = _mm_add_epi32(tmp4, rounding); 1.2660 + tmp6 = _mm_add_epi32(tmp6, rounding); 1.2661 + 1.2662 + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); 1.2663 + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); 1.2664 + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); 1.2665 + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); 1.2666 + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); 1.2667 + tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); 1.2668 + 1.2669 + stp1_5 = _mm_packs_epi32(tmp1, zero); 1.2670 + stp1_6 = _mm_packs_epi32(tmp3, zero); 1.2671 + stp2_10 = _mm_packs_epi32(tmp0, zero); 1.2672 + stp2_13 = _mm_packs_epi32(tmp2, zero); 1.2673 + stp2_11 = _mm_packs_epi32(tmp4, zero); 1.2674 + stp2_12 = _mm_packs_epi32(tmp6, zero); 1.2675 + 1.2676 + stp2_0 = _mm_add_epi16(stp1_0, stp2_7); 1.2677 + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); 1.2678 + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); 1.2679 + stp2_3 = _mm_add_epi16(stp1_3, stp2_4); 1.2680 + stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); 1.2681 + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); 1.2682 + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); 1.2683 + stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); 1.2684 + } 1.2685 + 1.2686 + // Stage7. Left 8x16 only. 1.2687 + l0 = _mm_add_epi16(stp2_0, stp1_15); 1.2688 + l1 = _mm_add_epi16(stp2_1, stp1_14); 1.2689 + l2 = _mm_add_epi16(stp2_2, stp2_13); 1.2690 + l3 = _mm_add_epi16(stp2_3, stp2_12); 1.2691 + l4 = _mm_add_epi16(stp2_4, stp2_11); 1.2692 + l5 = _mm_add_epi16(stp2_5, stp2_10); 1.2693 + l6 = _mm_add_epi16(stp2_6, stp1_9); 1.2694 + l7 = _mm_add_epi16(stp2_7, stp1_8); 1.2695 + l8 = _mm_sub_epi16(stp2_7, stp1_8); 1.2696 + l9 = _mm_sub_epi16(stp2_6, stp1_9); 1.2697 + l10 = _mm_sub_epi16(stp2_5, stp2_10); 1.2698 + l11 = _mm_sub_epi16(stp2_4, stp2_11); 1.2699 + l12 = _mm_sub_epi16(stp2_3, stp2_12); 1.2700 + l13 = _mm_sub_epi16(stp2_2, stp2_13); 1.2701 + l14 = _mm_sub_epi16(stp2_1, stp1_14); 1.2702 + l15 = _mm_sub_epi16(stp2_0, stp1_15); 1.2703 + 1.2704 + // 2-D idct. We do 2 8x16 blocks. 
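// Roughly, each iteration below runs the full 16-point column idct on one
// 8x16 half; in scalar form (idct16_1d is a hypothetical scalar helper):
//   idct16_1d(half[c]);                              // IDCT16_1D
//   half[c][r] = ROUND_POWER_OF_TWO(half[c][r], 6);  // +32, then >> 6
// before the result is added back into dest by RECON_AND_STORE.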
1.2705 + for (i = 0; i < 2; i++) { 1.2706 + if (i == 0) 1.2707 + TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, 1.2708 + in5, in6, in7); 1.2709 + 1.2710 + if (i == 1) 1.2711 + TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, 1.2712 + in4, in5, in6, in7); 1.2713 + 1.2714 + in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; 1.2715 + 1.2716 + IDCT16_1D 1.2717 + 1.2718 + // Stage7 1.2719 + in0 = _mm_add_epi16(stp2_0, stp1_15); 1.2720 + in1 = _mm_add_epi16(stp2_1, stp1_14); 1.2721 + in2 = _mm_add_epi16(stp2_2, stp2_13); 1.2722 + in3 = _mm_add_epi16(stp2_3, stp2_12); 1.2723 + in4 = _mm_add_epi16(stp2_4, stp2_11); 1.2724 + in5 = _mm_add_epi16(stp2_5, stp2_10); 1.2725 + in6 = _mm_add_epi16(stp2_6, stp1_9); 1.2726 + in7 = _mm_add_epi16(stp2_7, stp1_8); 1.2727 + in8 = _mm_sub_epi16(stp2_7, stp1_8); 1.2728 + in9 = _mm_sub_epi16(stp2_6, stp1_9); 1.2729 + in10 = _mm_sub_epi16(stp2_5, stp2_10); 1.2730 + in11 = _mm_sub_epi16(stp2_4, stp2_11); 1.2731 + in12 = _mm_sub_epi16(stp2_3, stp2_12); 1.2732 + in13 = _mm_sub_epi16(stp2_2, stp2_13); 1.2733 + in14 = _mm_sub_epi16(stp2_1, stp1_14); 1.2734 + in15 = _mm_sub_epi16(stp2_0, stp1_15); 1.2735 + 1.2736 + // Final rounding and shift 1.2737 + in0 = _mm_adds_epi16(in0, final_rounding); 1.2738 + in1 = _mm_adds_epi16(in1, final_rounding); 1.2739 + in2 = _mm_adds_epi16(in2, final_rounding); 1.2740 + in3 = _mm_adds_epi16(in3, final_rounding); 1.2741 + in4 = _mm_adds_epi16(in4, final_rounding); 1.2742 + in5 = _mm_adds_epi16(in5, final_rounding); 1.2743 + in6 = _mm_adds_epi16(in6, final_rounding); 1.2744 + in7 = _mm_adds_epi16(in7, final_rounding); 1.2745 + in8 = _mm_adds_epi16(in8, final_rounding); 1.2746 + in9 = _mm_adds_epi16(in9, final_rounding); 1.2747 + in10 = _mm_adds_epi16(in10, final_rounding); 1.2748 + in11 = _mm_adds_epi16(in11, final_rounding); 1.2749 + in12 = _mm_adds_epi16(in12, final_rounding); 1.2750 + in13 = _mm_adds_epi16(in13, final_rounding); 1.2751 + in14 = _mm_adds_epi16(in14, final_rounding); 1.2752 + in15 = _mm_adds_epi16(in15, final_rounding); 1.2753 + 1.2754 + in0 = _mm_srai_epi16(in0, 6); 1.2755 + in1 = _mm_srai_epi16(in1, 6); 1.2756 + in2 = _mm_srai_epi16(in2, 6); 1.2757 + in3 = _mm_srai_epi16(in3, 6); 1.2758 + in4 = _mm_srai_epi16(in4, 6); 1.2759 + in5 = _mm_srai_epi16(in5, 6); 1.2760 + in6 = _mm_srai_epi16(in6, 6); 1.2761 + in7 = _mm_srai_epi16(in7, 6); 1.2762 + in8 = _mm_srai_epi16(in8, 6); 1.2763 + in9 = _mm_srai_epi16(in9, 6); 1.2764 + in10 = _mm_srai_epi16(in10, 6); 1.2765 + in11 = _mm_srai_epi16(in11, 6); 1.2766 + in12 = _mm_srai_epi16(in12, 6); 1.2767 + in13 = _mm_srai_epi16(in13, 6); 1.2768 + in14 = _mm_srai_epi16(in14, 6); 1.2769 + in15 = _mm_srai_epi16(in15, 6); 1.2770 + 1.2771 + RECON_AND_STORE(dest, in0); 1.2772 + RECON_AND_STORE(dest, in1); 1.2773 + RECON_AND_STORE(dest, in2); 1.2774 + RECON_AND_STORE(dest, in3); 1.2775 + RECON_AND_STORE(dest, in4); 1.2776 + RECON_AND_STORE(dest, in5); 1.2777 + RECON_AND_STORE(dest, in6); 1.2778 + RECON_AND_STORE(dest, in7); 1.2779 + RECON_AND_STORE(dest, in8); 1.2780 + RECON_AND_STORE(dest, in9); 1.2781 + RECON_AND_STORE(dest, in10); 1.2782 + RECON_AND_STORE(dest, in11); 1.2783 + RECON_AND_STORE(dest, in12); 1.2784 + RECON_AND_STORE(dest, in13); 1.2785 + RECON_AND_STORE(dest, in14); 1.2786 + RECON_AND_STORE(dest, in15); 1.2787 + 1.2788 + dest += 8 - (stride * 16); 1.2789 + } 1.2790 +} 1.2791 + 1.2792 +#define LOAD_DQCOEFF(reg, input) \ 1.2793 + { \ 1.2794 + reg = _mm_load_si128((const __m128i *) input); \ 1.2795 + input += 8; \ 1.2796 + } 
\ 1.2797 + 1.2798 +#define IDCT32_1D \ 1.2799 +/* Stage1 */ \ 1.2800 +{ \ 1.2801 + const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ 1.2802 + const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ 1.2803 + const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ 1.2804 + const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ 1.2805 + \ 1.2806 + const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ 1.2807 + const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ 1.2808 + const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7); \ 1.2809 + const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ 1.2810 + \ 1.2811 + const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ 1.2812 + const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ 1.2813 + const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ 1.2814 + const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ 1.2815 + \ 1.2816 + const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ 1.2817 + const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ 1.2818 + const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ 1.2819 + const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ 1.2820 + \ 1.2821 + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ 1.2822 + stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ 1.2823 + stp1_17, stp1_30) \ 1.2824 + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ 1.2825 + stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ 1.2826 + stp1_19, stp1_28) \ 1.2827 + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ 1.2828 + stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ 1.2829 + stp1_21, stp1_26) \ 1.2830 + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ 1.2831 + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ 1.2832 + stp1_23, stp1_24) \ 1.2833 +} \ 1.2834 +\ 1.2835 +/* Stage2 */ \ 1.2836 +{ \ 1.2837 + const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ 1.2838 + const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ 1.2839 + const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ 1.2840 + const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ 1.2841 + \ 1.2842 + const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ 1.2843 + const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ 1.2844 + const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ 1.2845 + const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ 1.2846 + \ 1.2847 + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ 1.2848 + stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ 1.2849 + stp2_14) \ 1.2850 + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ 1.2851 + stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ 1.2852 + stp2_11, stp2_12) \ 1.2853 + \ 1.2854 + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ 1.2855 + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ 1.2856 + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ 1.2857 + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ 1.2858 + \ 1.2859 + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ 1.2860 + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ 1.2861 + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ 1.2862 + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ 1.2863 + \ 1.2864 + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ 1.2865 + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ 1.2866 + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ 1.2867 + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ 1.2868 + \ 1.2869 + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ 1.2870 + stp2_29 =
_mm_sub_epi16(stp1_28, stp1_29); \ 1.2871 + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ 1.2872 + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ 1.2873 +} \ 1.2874 +\ 1.2875 +/* Stage3 */ \ 1.2876 +{ \ 1.2877 + const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ 1.2878 + const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ 1.2879 + const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ 1.2880 + const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ 1.2881 + \ 1.2882 + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ 1.2883 + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ 1.2884 + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 1.2885 + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 1.2886 + \ 1.2887 + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 1.2888 + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 1.2889 + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 1.2890 + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 1.2891 + \ 1.2892 + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ 1.2893 + stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ 1.2894 + stp1_6) \ 1.2895 + \ 1.2896 + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ 1.2897 + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ 1.2898 + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ 1.2899 + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ 1.2900 + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ 1.2901 + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ 1.2902 + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ 1.2903 + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ 1.2904 + \ 1.2905 + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ 1.2906 + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ 1.2907 + stp1_18, stp1_29) \ 1.2908 + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ 1.2909 + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ 1.2910 + stp1_22, stp1_25) \ 1.2911 + \ 1.2912 + stp1_16 = stp2_16; \ 1.2913 + stp1_31 = stp2_31; \ 1.2914 + stp1_19 = stp2_19; \ 1.2915 + stp1_20 = stp2_20; \ 1.2916 + stp1_23 = stp2_23; \ 1.2917 + stp1_24 = stp2_24; \ 1.2918 + stp1_27 = stp2_27; \ 1.2919 + stp1_28 = stp2_28; \ 1.2920 +} \ 1.2921 +\ 1.2922 +/* Stage4 */ \ 1.2923 +{ \ 1.2924 + const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ 1.2925 + const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ 1.2926 + const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ 1.2927 + const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ 1.2928 + \ 1.2929 + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ 1.2930 + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ 1.2931 + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1.2932 + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1.2933 + \ 1.2934 + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ 1.2935 + stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ 1.2936 + stp2_2, stp2_3) \ 1.2937 + \ 1.2938 + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ 1.2939 + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ 1.2940 + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ 1.2941 + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ 1.2942 + \ 1.2943 + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ 1.2944 + stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ 1.2945 + stp2_10, stp2_13) \ 1.2946 + \ 1.2947 + stp2_8 = stp1_8; \ 1.2948 + stp2_15 = stp1_15; \ 1.2949 + stp2_11 = stp1_11; 
\ 1.2950 + stp2_12 = stp1_12; \ 1.2951 + \ 1.2952 + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ 1.2953 + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ 1.2954 + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ 1.2955 + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ 1.2956 + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ 1.2957 + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ 1.2958 + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ 1.2959 + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ 1.2960 + \ 1.2961 + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ 1.2962 + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ 1.2963 + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ 1.2964 + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ 1.2965 + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ 1.2966 + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ 1.2967 + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ 1.2968 + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ 1.2969 +} \ 1.2970 +\ 1.2971 +/* Stage5 */ \ 1.2972 +{ \ 1.2973 + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ 1.2974 + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ 1.2975 + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ 1.2976 + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ 1.2977 + \ 1.2978 + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ 1.2979 + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ 1.2980 + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 1.2981 + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 1.2982 + \ 1.2983 + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 1.2984 + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 1.2985 + \ 1.2986 + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ 1.2987 + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ 1.2988 + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ 1.2989 + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ 1.2990 + \ 1.2991 + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ 1.2992 + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ 1.2993 + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ 1.2994 + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ 1.2995 + \ 1.2996 + tmp0 = _mm_add_epi32(tmp0, rounding); \ 1.2997 + tmp1 = _mm_add_epi32(tmp1, rounding); \ 1.2998 + tmp2 = _mm_add_epi32(tmp2, rounding); \ 1.2999 + tmp3 = _mm_add_epi32(tmp3, rounding); \ 1.3000 + \ 1.3001 + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ 1.3002 + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ 1.3003 + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ 1.3004 + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ 1.3005 + \ 1.3006 + stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ 1.3007 + stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ 1.3008 + \ 1.3009 + stp1_4 = stp2_4; \ 1.3010 + stp1_7 = stp2_7; \ 1.3011 + \ 1.3012 + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ 1.3013 + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ 1.3014 + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ 1.3015 + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ 1.3016 + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ 1.3017 + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ 1.3018 + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ 1.3019 + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ 1.3020 + \ 1.3021 + stp1_16 = stp2_16; \ 1.3022 + stp1_17 = stp2_17; \ 1.3023 + \ 1.3024 + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ 1.3025 + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ 1.3026 + stp1_19, stp1_28) \ 1.3027 + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ 1.3028 + stg4_4, 
stg4_6, stg4_4, stp1_20, stp1_27, \ 1.3029 + stp1_21, stp1_26) \ 1.3030 + \ 1.3031 + stp1_22 = stp2_22; \ 1.3032 + stp1_23 = stp2_23; \ 1.3033 + stp1_24 = stp2_24; \ 1.3034 + stp1_25 = stp2_25; \ 1.3035 + stp1_30 = stp2_30; \ 1.3036 + stp1_31 = stp2_31; \ 1.3037 +} \ 1.3038 +\ 1.3039 +/* Stage6 */ \ 1.3040 +{ \ 1.3041 + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ 1.3042 + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ 1.3043 + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ 1.3044 + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ 1.3045 + \ 1.3046 + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ 1.3047 + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ 1.3048 + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ 1.3049 + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ 1.3050 + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ 1.3051 + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ 1.3052 + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ 1.3053 + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ 1.3054 + \ 1.3055 + stp2_8 = stp1_8; \ 1.3056 + stp2_9 = stp1_9; \ 1.3057 + stp2_14 = stp1_14; \ 1.3058 + stp2_15 = stp1_15; \ 1.3059 + \ 1.3060 + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ 1.3061 + stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ 1.3062 + stp2_13, stp2_11, stp2_12) \ 1.3063 + \ 1.3064 + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ 1.3065 + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ 1.3066 + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ 1.3067 + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ 1.3068 + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ 1.3069 + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ 1.3070 + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ 1.3071 + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ 1.3072 + \ 1.3073 + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ 1.3074 + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ 1.3075 + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ 1.3076 + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ 1.3077 + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ 1.3078 + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ 1.3079 + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ 1.3080 + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ 1.3081 +} \ 1.3082 +\ 1.3083 +/* Stage7 */ \ 1.3084 +{ \ 1.3085 + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ 1.3086 + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ 1.3087 + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ 1.3088 + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ 1.3089 + \ 1.3090 + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ 1.3091 + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ 1.3092 + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ 1.3093 + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ 1.3094 + \ 1.3095 + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ 1.3096 + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ 1.3097 + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ 1.3098 + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ 1.3099 + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ 1.3100 + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ 1.3101 + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ 1.3102 + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ 1.3103 + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ 1.3104 + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ 1.3105 + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ 1.3106 + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ 1.3107 + stp1_12 = _mm_sub_epi16(stp2_3, 
stp2_12); \ 1.3108 + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ 1.3109 + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ 1.3110 + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ 1.3111 + \ 1.3112 + stp1_16 = stp2_16; \ 1.3113 + stp1_17 = stp2_17; \ 1.3114 + stp1_18 = stp2_18; \ 1.3115 + stp1_19 = stp2_19; \ 1.3116 + \ 1.3117 + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ 1.3118 + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ 1.3119 + stp1_21, stp1_26) \ 1.3120 + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ 1.3121 + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ 1.3122 + stp1_23, stp1_24) \ 1.3123 + \ 1.3124 + stp1_28 = stp2_28; \ 1.3125 + stp1_29 = stp2_29; \ 1.3126 + stp1_30 = stp2_30; \ 1.3127 + stp1_31 = stp2_31; \ 1.3128 +} 1.3129 + 1.3130 +// Only upper-left 8x8 has non-zero coeff 1.3131 +void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, 1.3132 + int stride) { 1.3133 + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.3134 + const __m128i final_rounding = _mm_set1_epi16(1<<5); 1.3135 + 1.3136 + // idct constants for each stage 1.3137 + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1.3138 + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 1.3139 + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1.3140 + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 1.3141 + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1.3142 + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 1.3143 + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1.3144 + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 1.3145 + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1.3146 + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 1.3147 + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1.3148 + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 1.3149 + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); 1.3150 + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 1.3151 + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1.3152 + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 1.3153 + 1.3154 + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1.3155 + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1.3156 + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1.3157 + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1.3158 + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1.3159 + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1.3160 + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1.3161 + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1.3162 + 1.3163 + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1.3164 + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1.3165 + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1.3166 + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1.3167 + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1.3168 + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 1.3169 + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 1.3170 + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1.3171 + const __m128i stg3_9 = 
pair_set_epi16(cospi_12_64, cospi_20_64); 1.3172 + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 1.3173 + 1.3174 + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1.3175 + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1.3176 + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1.3177 + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1.3178 + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.3179 + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1.3180 + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1.3181 + 1.3182 + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.3183 + 1.3184 + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 1.3185 + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, 1.3186 + in24, in25, in26, in27, in28, in29, in30, in31; 1.3187 + __m128i col[128]; 1.3188 + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1.3189 + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1.3190 + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 1.3191 + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 1.3192 + stp1_30, stp1_31; 1.3193 + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1.3194 + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 1.3195 + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 1.3196 + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 1.3197 + stp2_30, stp2_31; 1.3198 + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1.3199 + int i, j, i32; 1.3200 + 1.3201 + // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. 1.3202 + for (i = 0; i < 8; i++) { 1.3203 + i32 = (i << 5); 1.3204 + if (i == 0) { 1.3205 + // First 1-D idct: first 8 rows 1.3206 + // Load input data.
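// Each 32-coefficient row spans four __m128i loads, so row r lands in
// in{r}, in{r+8}, in{r+16}, in{r+24}; as a sketch of the addressing,
// in[r + 8 * c] covers input[32 * r + 8 * c .. 32 * r + 8 * c + 7] for
// r = 0..7, c = 0..3, which lets each TRANSPOSE_8X8 below work on one
// contiguous 8x8 tile.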
1.3207 + LOAD_DQCOEFF(in0, input); 1.3208 + LOAD_DQCOEFF(in8, input); 1.3209 + LOAD_DQCOEFF(in16, input); 1.3210 + LOAD_DQCOEFF(in24, input); 1.3211 + LOAD_DQCOEFF(in1, input); 1.3212 + LOAD_DQCOEFF(in9, input); 1.3213 + LOAD_DQCOEFF(in17, input); 1.3214 + LOAD_DQCOEFF(in25, input); 1.3215 + LOAD_DQCOEFF(in2, input); 1.3216 + LOAD_DQCOEFF(in10, input); 1.3217 + LOAD_DQCOEFF(in18, input); 1.3218 + LOAD_DQCOEFF(in26, input); 1.3219 + LOAD_DQCOEFF(in3, input); 1.3220 + LOAD_DQCOEFF(in11, input); 1.3221 + LOAD_DQCOEFF(in19, input); 1.3222 + LOAD_DQCOEFF(in27, input); 1.3223 + 1.3224 + LOAD_DQCOEFF(in4, input); 1.3225 + LOAD_DQCOEFF(in12, input); 1.3226 + LOAD_DQCOEFF(in20, input); 1.3227 + LOAD_DQCOEFF(in28, input); 1.3228 + LOAD_DQCOEFF(in5, input); 1.3229 + LOAD_DQCOEFF(in13, input); 1.3230 + LOAD_DQCOEFF(in21, input); 1.3231 + LOAD_DQCOEFF(in29, input); 1.3232 + LOAD_DQCOEFF(in6, input); 1.3233 + LOAD_DQCOEFF(in14, input); 1.3234 + LOAD_DQCOEFF(in22, input); 1.3235 + LOAD_DQCOEFF(in30, input); 1.3236 + LOAD_DQCOEFF(in7, input); 1.3237 + LOAD_DQCOEFF(in15, input); 1.3238 + LOAD_DQCOEFF(in23, input); 1.3239 + LOAD_DQCOEFF(in31, input); 1.3240 + 1.3241 + // Transpose 32x8 block to 8x32 block 1.3242 + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1.3243 + in4, in5, in6, in7); 1.3244 + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 1.3245 + in10, in11, in12, in13, in14, in15); 1.3246 + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, 1.3247 + in18, in19, in20, in21, in22, in23); 1.3248 + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, 1.3249 + in26, in27, in28, in29, in30, in31); 1.3250 + } else if (i < 4) { 1.3251 + // First 1-D idct: next 24 zero-coeff rows 1.3252 + col[i32 + 0] = _mm_setzero_si128(); 1.3253 + col[i32 + 1] = _mm_setzero_si128(); 1.3254 + col[i32 + 2] = _mm_setzero_si128(); 1.3255 + col[i32 + 3] = _mm_setzero_si128(); 1.3256 + col[i32 + 4] = _mm_setzero_si128(); 1.3257 + col[i32 + 5] = _mm_setzero_si128(); 1.3258 + col[i32 + 6] = _mm_setzero_si128(); 1.3259 + col[i32 + 7] = _mm_setzero_si128(); 1.3260 + col[i32 + 8] = _mm_setzero_si128(); 1.3261 + col[i32 + 9] = _mm_setzero_si128(); 1.3262 + col[i32 + 10] = _mm_setzero_si128(); 1.3263 + col[i32 + 11] = _mm_setzero_si128(); 1.3264 + col[i32 + 12] = _mm_setzero_si128(); 1.3265 + col[i32 + 13] = _mm_setzero_si128(); 1.3266 + col[i32 + 14] = _mm_setzero_si128(); 1.3267 + col[i32 + 15] = _mm_setzero_si128(); 1.3268 + col[i32 + 16] = _mm_setzero_si128(); 1.3269 + col[i32 + 17] = _mm_setzero_si128(); 1.3270 + col[i32 + 18] = _mm_setzero_si128(); 1.3271 + col[i32 + 19] = _mm_setzero_si128(); 1.3272 + col[i32 + 20] = _mm_setzero_si128(); 1.3273 + col[i32 + 21] = _mm_setzero_si128(); 1.3274 + col[i32 + 22] = _mm_setzero_si128(); 1.3275 + col[i32 + 23] = _mm_setzero_si128(); 1.3276 + col[i32 + 24] = _mm_setzero_si128(); 1.3277 + col[i32 + 25] = _mm_setzero_si128(); 1.3278 + col[i32 + 26] = _mm_setzero_si128(); 1.3279 + col[i32 + 27] = _mm_setzero_si128(); 1.3280 + col[i32 + 28] = _mm_setzero_si128(); 1.3281 + col[i32 + 29] = _mm_setzero_si128(); 1.3282 + col[i32 + 30] = _mm_setzero_si128(); 1.3283 + col[i32 + 31] = _mm_setzero_si128(); 1.3284 + continue; 1.3285 + } else { 1.3286 + // Second 1-D idct 1.3287 + j = i - 4; 1.3288 + 1.3289 + // Transpose 32x8 block to 8x32 block 1.3290 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3291 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3292 + col[j * 8 + 6], col[j * 8 + 
7], in0, in1, in2, in3, in4, 1.3293 + in5, in6, in7); 1.3294 + j += 4; 1.3295 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3296 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3297 + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, 1.3298 + in11, in12, in13, in14, in15); 1.3299 + j += 4; 1.3300 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3301 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3302 + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, 1.3303 + in19, in20, in21, in22, in23); 1.3304 + j += 4; 1.3305 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3306 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3307 + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, 1.3308 + in28, in29, in30, in31); 1.3309 + } 1.3310 + 1.3311 + IDCT32_1D 1.3312 + 1.3313 + // final stage 1.3314 + if (i < 4) { 1.3315 + // 1-D: Store 32 intermediate results for each 8x32 block. 1.3316 + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 1.3317 + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 1.3318 + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 1.3319 + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 1.3320 + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 1.3321 + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 1.3322 + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 1.3323 + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 1.3324 + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 1.3325 + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); 1.3326 + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); 1.3327 + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); 1.3328 + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); 1.3329 + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); 1.3330 + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); 1.3331 + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); 1.3332 + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); 1.3333 + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); 1.3334 + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); 1.3335 + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); 1.3336 + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); 1.3337 + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); 1.3338 + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 1.3339 + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 1.3340 + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 1.3341 + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 1.3342 + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 1.3343 + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 1.3344 + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 1.3345 + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 1.3346 + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 1.3347 + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 1.3348 + } else { 1.3349 + const __m128i zero = _mm_setzero_si128(); 1.3350 + 1.3351 + // 2-D: Calculate the results and store them to the destination.
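// This is the last idct32 butterfly: out[k] = stp1_k + stp1_(31-k) and
// out[31 - k] = stp1_k - stp1_(31-k) for k = 0..15. Adding final_rounding
// (1 << 5) with saturation and then shifting right by 6 matches the scalar
// ROUND_POWER_OF_TWO(out[k], 6) output scaling.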
1.3352 + in0 = _mm_add_epi16(stp1_0, stp1_31); 1.3353 + in1 = _mm_add_epi16(stp1_1, stp1_30); 1.3354 + in2 = _mm_add_epi16(stp1_2, stp1_29); 1.3355 + in3 = _mm_add_epi16(stp1_3, stp1_28); 1.3356 + in4 = _mm_add_epi16(stp1_4, stp1_27); 1.3357 + in5 = _mm_add_epi16(stp1_5, stp1_26); 1.3358 + in6 = _mm_add_epi16(stp1_6, stp1_25); 1.3359 + in7 = _mm_add_epi16(stp1_7, stp1_24); 1.3360 + in8 = _mm_add_epi16(stp1_8, stp1_23); 1.3361 + in9 = _mm_add_epi16(stp1_9, stp1_22); 1.3362 + in10 = _mm_add_epi16(stp1_10, stp1_21); 1.3363 + in11 = _mm_add_epi16(stp1_11, stp1_20); 1.3364 + in12 = _mm_add_epi16(stp1_12, stp1_19); 1.3365 + in13 = _mm_add_epi16(stp1_13, stp1_18); 1.3366 + in14 = _mm_add_epi16(stp1_14, stp1_17); 1.3367 + in15 = _mm_add_epi16(stp1_15, stp1_16); 1.3368 + in16 = _mm_sub_epi16(stp1_15, stp1_16); 1.3369 + in17 = _mm_sub_epi16(stp1_14, stp1_17); 1.3370 + in18 = _mm_sub_epi16(stp1_13, stp1_18); 1.3371 + in19 = _mm_sub_epi16(stp1_12, stp1_19); 1.3372 + in20 = _mm_sub_epi16(stp1_11, stp1_20); 1.3373 + in21 = _mm_sub_epi16(stp1_10, stp1_21); 1.3374 + in22 = _mm_sub_epi16(stp1_9, stp1_22); 1.3375 + in23 = _mm_sub_epi16(stp1_8, stp1_23); 1.3376 + in24 = _mm_sub_epi16(stp1_7, stp1_24); 1.3377 + in25 = _mm_sub_epi16(stp1_6, stp1_25); 1.3378 + in26 = _mm_sub_epi16(stp1_5, stp1_26); 1.3379 + in27 = _mm_sub_epi16(stp1_4, stp1_27); 1.3380 + in28 = _mm_sub_epi16(stp1_3, stp1_28); 1.3381 + in29 = _mm_sub_epi16(stp1_2, stp1_29); 1.3382 + in30 = _mm_sub_epi16(stp1_1, stp1_30); 1.3383 + in31 = _mm_sub_epi16(stp1_0, stp1_31); 1.3384 + 1.3385 + // Final rounding and shift 1.3386 + in0 = _mm_adds_epi16(in0, final_rounding); 1.3387 + in1 = _mm_adds_epi16(in1, final_rounding); 1.3388 + in2 = _mm_adds_epi16(in2, final_rounding); 1.3389 + in3 = _mm_adds_epi16(in3, final_rounding); 1.3390 + in4 = _mm_adds_epi16(in4, final_rounding); 1.3391 + in5 = _mm_adds_epi16(in5, final_rounding); 1.3392 + in6 = _mm_adds_epi16(in6, final_rounding); 1.3393 + in7 = _mm_adds_epi16(in7, final_rounding); 1.3394 + in8 = _mm_adds_epi16(in8, final_rounding); 1.3395 + in9 = _mm_adds_epi16(in9, final_rounding); 1.3396 + in10 = _mm_adds_epi16(in10, final_rounding); 1.3397 + in11 = _mm_adds_epi16(in11, final_rounding); 1.3398 + in12 = _mm_adds_epi16(in12, final_rounding); 1.3399 + in13 = _mm_adds_epi16(in13, final_rounding); 1.3400 + in14 = _mm_adds_epi16(in14, final_rounding); 1.3401 + in15 = _mm_adds_epi16(in15, final_rounding); 1.3402 + in16 = _mm_adds_epi16(in16, final_rounding); 1.3403 + in17 = _mm_adds_epi16(in17, final_rounding); 1.3404 + in18 = _mm_adds_epi16(in18, final_rounding); 1.3405 + in19 = _mm_adds_epi16(in19, final_rounding); 1.3406 + in20 = _mm_adds_epi16(in20, final_rounding); 1.3407 + in21 = _mm_adds_epi16(in21, final_rounding); 1.3408 + in22 = _mm_adds_epi16(in22, final_rounding); 1.3409 + in23 = _mm_adds_epi16(in23, final_rounding); 1.3410 + in24 = _mm_adds_epi16(in24, final_rounding); 1.3411 + in25 = _mm_adds_epi16(in25, final_rounding); 1.3412 + in26 = _mm_adds_epi16(in26, final_rounding); 1.3413 + in27 = _mm_adds_epi16(in27, final_rounding); 1.3414 + in28 = _mm_adds_epi16(in28, final_rounding); 1.3415 + in29 = _mm_adds_epi16(in29, final_rounding); 1.3416 + in30 = _mm_adds_epi16(in30, final_rounding); 1.3417 + in31 = _mm_adds_epi16(in31, final_rounding); 1.3418 + 1.3419 + in0 = _mm_srai_epi16(in0, 6); 1.3420 + in1 = _mm_srai_epi16(in1, 6); 1.3421 + in2 = _mm_srai_epi16(in2, 6); 1.3422 + in3 = _mm_srai_epi16(in3, 6); 1.3423 + in4 = _mm_srai_epi16(in4, 6); 1.3424 + in5 = _mm_srai_epi16(in5, 6); 1.3425 + in6 = 
_mm_srai_epi16(in6, 6); 1.3426 + in7 = _mm_srai_epi16(in7, 6); 1.3427 + in8 = _mm_srai_epi16(in8, 6); 1.3428 + in9 = _mm_srai_epi16(in9, 6); 1.3429 + in10 = _mm_srai_epi16(in10, 6); 1.3430 + in11 = _mm_srai_epi16(in11, 6); 1.3431 + in12 = _mm_srai_epi16(in12, 6); 1.3432 + in13 = _mm_srai_epi16(in13, 6); 1.3433 + in14 = _mm_srai_epi16(in14, 6); 1.3434 + in15 = _mm_srai_epi16(in15, 6); 1.3435 + in16 = _mm_srai_epi16(in16, 6); 1.3436 + in17 = _mm_srai_epi16(in17, 6); 1.3437 + in18 = _mm_srai_epi16(in18, 6); 1.3438 + in19 = _mm_srai_epi16(in19, 6); 1.3439 + in20 = _mm_srai_epi16(in20, 6); 1.3440 + in21 = _mm_srai_epi16(in21, 6); 1.3441 + in22 = _mm_srai_epi16(in22, 6); 1.3442 + in23 = _mm_srai_epi16(in23, 6); 1.3443 + in24 = _mm_srai_epi16(in24, 6); 1.3444 + in25 = _mm_srai_epi16(in25, 6); 1.3445 + in26 = _mm_srai_epi16(in26, 6); 1.3446 + in27 = _mm_srai_epi16(in27, 6); 1.3447 + in28 = _mm_srai_epi16(in28, 6); 1.3448 + in29 = _mm_srai_epi16(in29, 6); 1.3449 + in30 = _mm_srai_epi16(in30, 6); 1.3450 + in31 = _mm_srai_epi16(in31, 6); 1.3451 + 1.3452 + RECON_AND_STORE(dest, in0); 1.3453 + RECON_AND_STORE(dest, in1); 1.3454 + RECON_AND_STORE(dest, in2); 1.3455 + RECON_AND_STORE(dest, in3); 1.3456 + RECON_AND_STORE(dest, in4); 1.3457 + RECON_AND_STORE(dest, in5); 1.3458 + RECON_AND_STORE(dest, in6); 1.3459 + RECON_AND_STORE(dest, in7); 1.3460 + RECON_AND_STORE(dest, in8); 1.3461 + RECON_AND_STORE(dest, in9); 1.3462 + RECON_AND_STORE(dest, in10); 1.3463 + RECON_AND_STORE(dest, in11); 1.3464 + RECON_AND_STORE(dest, in12); 1.3465 + RECON_AND_STORE(dest, in13); 1.3466 + RECON_AND_STORE(dest, in14); 1.3467 + RECON_AND_STORE(dest, in15); 1.3468 + RECON_AND_STORE(dest, in16); 1.3469 + RECON_AND_STORE(dest, in17); 1.3470 + RECON_AND_STORE(dest, in18); 1.3471 + RECON_AND_STORE(dest, in19); 1.3472 + RECON_AND_STORE(dest, in20); 1.3473 + RECON_AND_STORE(dest, in21); 1.3474 + RECON_AND_STORE(dest, in22); 1.3475 + RECON_AND_STORE(dest, in23); 1.3476 + RECON_AND_STORE(dest, in24); 1.3477 + RECON_AND_STORE(dest, in25); 1.3478 + RECON_AND_STORE(dest, in26); 1.3479 + RECON_AND_STORE(dest, in27); 1.3480 + RECON_AND_STORE(dest, in28); 1.3481 + RECON_AND_STORE(dest, in29); 1.3482 + RECON_AND_STORE(dest, in30); 1.3483 + RECON_AND_STORE(dest, in31); 1.3484 + 1.3485 + dest += 8 - (stride * 32); 1.3486 + } 1.3487 + } 1.3488 +} 1.3489 + 1.3490 +void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, 1.3491 + int stride) { 1.3492 + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.3493 + const __m128i final_rounding = _mm_set1_epi16(1<<5); 1.3494 + 1.3495 + // idct constants for each stage 1.3496 + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); 1.3497 + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); 1.3498 + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); 1.3499 + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); 1.3500 + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); 1.3501 + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); 1.3502 + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); 1.3503 + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); 1.3504 + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); 1.3505 + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); 1.3506 + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); 1.3507 + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); 1.3508 + const __m128i stg1_12 = 
pair_set_epi16(cospi_19_64, -cospi_13_64); 1.3509 + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); 1.3510 + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); 1.3511 + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); 1.3512 + 1.3513 + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); 1.3514 + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); 1.3515 + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); 1.3516 + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); 1.3517 + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); 1.3518 + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); 1.3519 + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); 1.3520 + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); 1.3521 + 1.3522 + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); 1.3523 + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); 1.3524 + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); 1.3525 + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); 1.3526 + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1.3527 + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); 1.3528 + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 1.3529 + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1.3530 + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); 1.3531 + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 1.3532 + 1.3533 + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); 1.3534 + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); 1.3535 + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); 1.3536 + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); 1.3537 + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.3538 + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); 1.3539 + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1.3540 + 1.3541 + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); 1.3542 + 1.3543 + __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, 1.3544 + in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, 1.3545 + in24, in25, in26, in27, in28, in29, in30, in31; 1.3546 + __m128i col[128]; 1.3547 + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, 1.3548 + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, 1.3549 + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, 1.3550 + stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, 1.3551 + stp1_30, stp1_31; 1.3552 + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, 1.3553 + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, 1.3554 + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, 1.3555 + stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, 1.3556 + stp2_30, stp2_31; 1.3557 + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; 1.3558 + int i, j, i32; 1.3559 + __m128i zero_idx[16]; 1.3560 + int zero_flag[2]; 1.3561 + 1.3562 + // We work on an 8x32 block each time, and loop 8 times for the 2-D 32x32 idct. 1.3563 + for (i = 0; i < 8; i++) { 1.3564 + i32 = (i << 5); 1.3565 + if (i < 4) { 1.3566 + // First 1-D idct 1.3567 + // Load input data.
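// The loads below fetch one 8x32 strip of dequantized coefficients; the
// OR-reduction that follows them folds all 32 registers into a single
// vector and tests its two 32-bit halves, i.e. in scalar form (acc and
// coeff[] are illustrative names):
//   acc = 0; for (k = 0; k < 256; ++k) acc |= coeff[k];
//   if (acc == 0) { /* strip is all zero: store zeros and continue */ }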
1.3568 + LOAD_DQCOEFF(in0, input); 1.3569 + LOAD_DQCOEFF(in8, input); 1.3570 + LOAD_DQCOEFF(in16, input); 1.3571 + LOAD_DQCOEFF(in24, input); 1.3572 + LOAD_DQCOEFF(in1, input); 1.3573 + LOAD_DQCOEFF(in9, input); 1.3574 + LOAD_DQCOEFF(in17, input); 1.3575 + LOAD_DQCOEFF(in25, input); 1.3576 + LOAD_DQCOEFF(in2, input); 1.3577 + LOAD_DQCOEFF(in10, input); 1.3578 + LOAD_DQCOEFF(in18, input); 1.3579 + LOAD_DQCOEFF(in26, input); 1.3580 + LOAD_DQCOEFF(in3, input); 1.3581 + LOAD_DQCOEFF(in11, input); 1.3582 + LOAD_DQCOEFF(in19, input); 1.3583 + LOAD_DQCOEFF(in27, input); 1.3584 + 1.3585 + LOAD_DQCOEFF(in4, input); 1.3586 + LOAD_DQCOEFF(in12, input); 1.3587 + LOAD_DQCOEFF(in20, input); 1.3588 + LOAD_DQCOEFF(in28, input); 1.3589 + LOAD_DQCOEFF(in5, input); 1.3590 + LOAD_DQCOEFF(in13, input); 1.3591 + LOAD_DQCOEFF(in21, input); 1.3592 + LOAD_DQCOEFF(in29, input); 1.3593 + LOAD_DQCOEFF(in6, input); 1.3594 + LOAD_DQCOEFF(in14, input); 1.3595 + LOAD_DQCOEFF(in22, input); 1.3596 + LOAD_DQCOEFF(in30, input); 1.3597 + LOAD_DQCOEFF(in7, input); 1.3598 + LOAD_DQCOEFF(in15, input); 1.3599 + LOAD_DQCOEFF(in23, input); 1.3600 + LOAD_DQCOEFF(in31, input); 1.3601 + 1.3602 + // checking if all entries are zero 1.3603 + zero_idx[0] = _mm_or_si128(in0, in1); 1.3604 + zero_idx[1] = _mm_or_si128(in2, in3); 1.3605 + zero_idx[2] = _mm_or_si128(in4, in5); 1.3606 + zero_idx[3] = _mm_or_si128(in6, in7); 1.3607 + zero_idx[4] = _mm_or_si128(in8, in9); 1.3608 + zero_idx[5] = _mm_or_si128(in10, in11); 1.3609 + zero_idx[6] = _mm_or_si128(in12, in13); 1.3610 + zero_idx[7] = _mm_or_si128(in14, in15); 1.3611 + zero_idx[8] = _mm_or_si128(in16, in17); 1.3612 + zero_idx[9] = _mm_or_si128(in18, in19); 1.3613 + zero_idx[10] = _mm_or_si128(in20, in21); 1.3614 + zero_idx[11] = _mm_or_si128(in22, in23); 1.3615 + zero_idx[12] = _mm_or_si128(in24, in25); 1.3616 + zero_idx[13] = _mm_or_si128(in26, in27); 1.3617 + zero_idx[14] = _mm_or_si128(in28, in29); 1.3618 + zero_idx[15] = _mm_or_si128(in30, in31); 1.3619 + 1.3620 + zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); 1.3621 + zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); 1.3622 + zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); 1.3623 + zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); 1.3624 + zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); 1.3625 + zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); 1.3626 + zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); 1.3627 + zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); 1.3628 + 1.3629 + zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); 1.3630 + zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); 1.3631 + zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); 1.3632 + zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); 1.3633 + zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); 1.3634 + zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); 1.3635 + zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); 1.3636 + 1.3637 + zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); 1.3638 + zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); 1.3639 + zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); 1.3640 + zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); 1.3641 + zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); 1.3642 + 1.3643 + if (!zero_flag[0] && !zero_flag[1]) { 1.3644 + col[i32 + 0] = _mm_setzero_si128(); 1.3645 + col[i32 + 1] = _mm_setzero_si128(); 1.3646 + col[i32 + 2] = _mm_setzero_si128(); 1.3647 + col[i32 + 3] = _mm_setzero_si128(); 1.3648 + col[i32 + 4] = 
_mm_setzero_si128(); 1.3649 + col[i32 + 5] = _mm_setzero_si128(); 1.3650 + col[i32 + 6] = _mm_setzero_si128(); 1.3651 + col[i32 + 7] = _mm_setzero_si128(); 1.3652 + col[i32 + 8] = _mm_setzero_si128(); 1.3653 + col[i32 + 9] = _mm_setzero_si128(); 1.3654 + col[i32 + 10] = _mm_setzero_si128(); 1.3655 + col[i32 + 11] = _mm_setzero_si128(); 1.3656 + col[i32 + 12] = _mm_setzero_si128(); 1.3657 + col[i32 + 13] = _mm_setzero_si128(); 1.3658 + col[i32 + 14] = _mm_setzero_si128(); 1.3659 + col[i32 + 15] = _mm_setzero_si128(); 1.3660 + col[i32 + 16] = _mm_setzero_si128(); 1.3661 + col[i32 + 17] = _mm_setzero_si128(); 1.3662 + col[i32 + 18] = _mm_setzero_si128(); 1.3663 + col[i32 + 19] = _mm_setzero_si128(); 1.3664 + col[i32 + 20] = _mm_setzero_si128(); 1.3665 + col[i32 + 21] = _mm_setzero_si128(); 1.3666 + col[i32 + 22] = _mm_setzero_si128(); 1.3667 + col[i32 + 23] = _mm_setzero_si128(); 1.3668 + col[i32 + 24] = _mm_setzero_si128(); 1.3669 + col[i32 + 25] = _mm_setzero_si128(); 1.3670 + col[i32 + 26] = _mm_setzero_si128(); 1.3671 + col[i32 + 27] = _mm_setzero_si128(); 1.3672 + col[i32 + 28] = _mm_setzero_si128(); 1.3673 + col[i32 + 29] = _mm_setzero_si128(); 1.3674 + col[i32 + 30] = _mm_setzero_si128(); 1.3675 + col[i32 + 31] = _mm_setzero_si128(); 1.3676 + continue; 1.3677 + } 1.3678 + 1.3679 + // Transpose 32x8 block to 8x32 block 1.3680 + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, 1.3681 + in4, in5, in6, in7); 1.3682 + TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, 1.3683 + in10, in11, in12, in13, in14, in15); 1.3684 + TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, 1.3685 + in18, in19, in20, in21, in22, in23); 1.3686 + TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, 1.3687 + in26, in27, in28, in29, in30, in31); 1.3688 + } else { 1.3689 + // Second 1-D idct 1.3690 + j = i - 4; 1.3691 + 1.3692 + // Transpose 32x8 block to 8x32 block 1.3693 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3694 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3695 + col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, 1.3696 + in5, in6, in7); 1.3697 + j += 4; 1.3698 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3699 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3700 + col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, 1.3701 + in11, in12, in13, in14, in15); 1.3702 + j += 4; 1.3703 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3704 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3705 + col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, 1.3706 + in19, in20, in21, in22, in23); 1.3707 + j += 4; 1.3708 + TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], 1.3709 + col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], 1.3710 + col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, 1.3711 + in28, in29, in30, in31); 1.3712 + } 1.3713 + 1.3714 + IDCT32_1D 1.3715 + 1.3716 + // final stage 1.3717 + if (i < 4) { 1.3718 + // 1-D: Store 32 intermediate results for each 8x32 block.
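// First-pass output of the final butterfly, kept in col[] for the second
// pass: col[i32 + k] = stp1_k + stp1_(31-k) and
// col[i32 + 31 - k] = stp1_k - stp1_(31-k), k = 0..15.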
1.3719 + col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); 1.3720 + col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); 1.3721 + col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); 1.3722 + col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); 1.3723 + col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); 1.3724 + col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); 1.3725 + col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); 1.3726 + col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); 1.3727 + col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); 1.3728 + col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); 1.3729 + col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); 1.3730 + col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); 1.3731 + col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); 1.3732 + col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); 1.3733 + col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); 1.3734 + col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); 1.3735 + col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); 1.3736 + col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); 1.3737 + col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); 1.3738 + col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); 1.3739 + col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); 1.3740 + col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); 1.3741 + col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); 1.3742 + col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); 1.3743 + col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); 1.3744 + col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); 1.3745 + col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); 1.3746 + col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); 1.3747 + col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); 1.3748 + col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); 1.3749 + col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); 1.3750 + col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); 1.3751 + } else { 1.3752 + const __m128i zero = _mm_setzero_si128(); 1.3753 + 1.3754 + // 2-D: Calculate the results and store them to the destination.
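// Same final butterfly as the first pass, but rounded, shifted, and added
// into the frame. Each RECON_AND_STORE advances dest by stride, so after
// the 32 stores, dest += 8 - (stride * 32) rewinds to the top row of the
// next 8-pixel-wide strip.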
1.3755 + in0 = _mm_add_epi16(stp1_0, stp1_31); 1.3756 + in1 = _mm_add_epi16(stp1_1, stp1_30); 1.3757 + in2 = _mm_add_epi16(stp1_2, stp1_29); 1.3758 + in3 = _mm_add_epi16(stp1_3, stp1_28); 1.3759 + in4 = _mm_add_epi16(stp1_4, stp1_27); 1.3760 + in5 = _mm_add_epi16(stp1_5, stp1_26); 1.3761 + in6 = _mm_add_epi16(stp1_6, stp1_25); 1.3762 + in7 = _mm_add_epi16(stp1_7, stp1_24); 1.3763 + in8 = _mm_add_epi16(stp1_8, stp1_23); 1.3764 + in9 = _mm_add_epi16(stp1_9, stp1_22); 1.3765 + in10 = _mm_add_epi16(stp1_10, stp1_21); 1.3766 + in11 = _mm_add_epi16(stp1_11, stp1_20); 1.3767 + in12 = _mm_add_epi16(stp1_12, stp1_19); 1.3768 + in13 = _mm_add_epi16(stp1_13, stp1_18); 1.3769 + in14 = _mm_add_epi16(stp1_14, stp1_17); 1.3770 + in15 = _mm_add_epi16(stp1_15, stp1_16); 1.3771 + in16 = _mm_sub_epi16(stp1_15, stp1_16); 1.3772 + in17 = _mm_sub_epi16(stp1_14, stp1_17); 1.3773 + in18 = _mm_sub_epi16(stp1_13, stp1_18); 1.3774 + in19 = _mm_sub_epi16(stp1_12, stp1_19); 1.3775 + in20 = _mm_sub_epi16(stp1_11, stp1_20); 1.3776 + in21 = _mm_sub_epi16(stp1_10, stp1_21); 1.3777 + in22 = _mm_sub_epi16(stp1_9, stp1_22); 1.3778 + in23 = _mm_sub_epi16(stp1_8, stp1_23); 1.3779 + in24 = _mm_sub_epi16(stp1_7, stp1_24); 1.3780 + in25 = _mm_sub_epi16(stp1_6, stp1_25); 1.3781 + in26 = _mm_sub_epi16(stp1_5, stp1_26); 1.3782 + in27 = _mm_sub_epi16(stp1_4, stp1_27); 1.3783 + in28 = _mm_sub_epi16(stp1_3, stp1_28); 1.3784 + in29 = _mm_sub_epi16(stp1_2, stp1_29); 1.3785 + in30 = _mm_sub_epi16(stp1_1, stp1_30); 1.3786 + in31 = _mm_sub_epi16(stp1_0, stp1_31); 1.3787 + 1.3788 + // Final rounding and shift 1.3789 + in0 = _mm_adds_epi16(in0, final_rounding); 1.3790 + in1 = _mm_adds_epi16(in1, final_rounding); 1.3791 + in2 = _mm_adds_epi16(in2, final_rounding); 1.3792 + in3 = _mm_adds_epi16(in3, final_rounding); 1.3793 + in4 = _mm_adds_epi16(in4, final_rounding); 1.3794 + in5 = _mm_adds_epi16(in5, final_rounding); 1.3795 + in6 = _mm_adds_epi16(in6, final_rounding); 1.3796 + in7 = _mm_adds_epi16(in7, final_rounding); 1.3797 + in8 = _mm_adds_epi16(in8, final_rounding); 1.3798 + in9 = _mm_adds_epi16(in9, final_rounding); 1.3799 + in10 = _mm_adds_epi16(in10, final_rounding); 1.3800 + in11 = _mm_adds_epi16(in11, final_rounding); 1.3801 + in12 = _mm_adds_epi16(in12, final_rounding); 1.3802 + in13 = _mm_adds_epi16(in13, final_rounding); 1.3803 + in14 = _mm_adds_epi16(in14, final_rounding); 1.3804 + in15 = _mm_adds_epi16(in15, final_rounding); 1.3805 + in16 = _mm_adds_epi16(in16, final_rounding); 1.3806 + in17 = _mm_adds_epi16(in17, final_rounding); 1.3807 + in18 = _mm_adds_epi16(in18, final_rounding); 1.3808 + in19 = _mm_adds_epi16(in19, final_rounding); 1.3809 + in20 = _mm_adds_epi16(in20, final_rounding); 1.3810 + in21 = _mm_adds_epi16(in21, final_rounding); 1.3811 + in22 = _mm_adds_epi16(in22, final_rounding); 1.3812 + in23 = _mm_adds_epi16(in23, final_rounding); 1.3813 + in24 = _mm_adds_epi16(in24, final_rounding); 1.3814 + in25 = _mm_adds_epi16(in25, final_rounding); 1.3815 + in26 = _mm_adds_epi16(in26, final_rounding); 1.3816 + in27 = _mm_adds_epi16(in27, final_rounding); 1.3817 + in28 = _mm_adds_epi16(in28, final_rounding); 1.3818 + in29 = _mm_adds_epi16(in29, final_rounding); 1.3819 + in30 = _mm_adds_epi16(in30, final_rounding); 1.3820 + in31 = _mm_adds_epi16(in31, final_rounding); 1.3821 + 1.3822 + in0 = _mm_srai_epi16(in0, 6); 1.3823 + in1 = _mm_srai_epi16(in1, 6); 1.3824 + in2 = _mm_srai_epi16(in2, 6); 1.3825 + in3 = _mm_srai_epi16(in3, 6); 1.3826 + in4 = _mm_srai_epi16(in4, 6); 1.3827 + in5 = _mm_srai_epi16(in5, 6); 1.3828 + in6 = 
_mm_srai_epi16(in6, 6); 1.3829 + in7 = _mm_srai_epi16(in7, 6); 1.3830 + in8 = _mm_srai_epi16(in8, 6); 1.3831 + in9 = _mm_srai_epi16(in9, 6); 1.3832 + in10 = _mm_srai_epi16(in10, 6); 1.3833 + in11 = _mm_srai_epi16(in11, 6); 1.3834 + in12 = _mm_srai_epi16(in12, 6); 1.3835 + in13 = _mm_srai_epi16(in13, 6); 1.3836 + in14 = _mm_srai_epi16(in14, 6); 1.3837 + in15 = _mm_srai_epi16(in15, 6); 1.3838 + in16 = _mm_srai_epi16(in16, 6); 1.3839 + in17 = _mm_srai_epi16(in17, 6); 1.3840 + in18 = _mm_srai_epi16(in18, 6); 1.3841 + in19 = _mm_srai_epi16(in19, 6); 1.3842 + in20 = _mm_srai_epi16(in20, 6); 1.3843 + in21 = _mm_srai_epi16(in21, 6); 1.3844 + in22 = _mm_srai_epi16(in22, 6); 1.3845 + in23 = _mm_srai_epi16(in23, 6); 1.3846 + in24 = _mm_srai_epi16(in24, 6); 1.3847 + in25 = _mm_srai_epi16(in25, 6); 1.3848 + in26 = _mm_srai_epi16(in26, 6); 1.3849 + in27 = _mm_srai_epi16(in27, 6); 1.3850 + in28 = _mm_srai_epi16(in28, 6); 1.3851 + in29 = _mm_srai_epi16(in29, 6); 1.3852 + in30 = _mm_srai_epi16(in30, 6); 1.3853 + in31 = _mm_srai_epi16(in31, 6); 1.3854 + 1.3855 + RECON_AND_STORE(dest, in0); 1.3856 + RECON_AND_STORE(dest, in1); 1.3857 + RECON_AND_STORE(dest, in2); 1.3858 + RECON_AND_STORE(dest, in3); 1.3859 + RECON_AND_STORE(dest, in4); 1.3860 + RECON_AND_STORE(dest, in5); 1.3861 + RECON_AND_STORE(dest, in6); 1.3862 + RECON_AND_STORE(dest, in7); 1.3863 + RECON_AND_STORE(dest, in8); 1.3864 + RECON_AND_STORE(dest, in9); 1.3865 + RECON_AND_STORE(dest, in10); 1.3866 + RECON_AND_STORE(dest, in11); 1.3867 + RECON_AND_STORE(dest, in12); 1.3868 + RECON_AND_STORE(dest, in13); 1.3869 + RECON_AND_STORE(dest, in14); 1.3870 + RECON_AND_STORE(dest, in15); 1.3871 + RECON_AND_STORE(dest, in16); 1.3872 + RECON_AND_STORE(dest, in17); 1.3873 + RECON_AND_STORE(dest, in18); 1.3874 + RECON_AND_STORE(dest, in19); 1.3875 + RECON_AND_STORE(dest, in20); 1.3876 + RECON_AND_STORE(dest, in21); 1.3877 + RECON_AND_STORE(dest, in22); 1.3878 + RECON_AND_STORE(dest, in23); 1.3879 + RECON_AND_STORE(dest, in24); 1.3880 + RECON_AND_STORE(dest, in25); 1.3881 + RECON_AND_STORE(dest, in26); 1.3882 + RECON_AND_STORE(dest, in27); 1.3883 + RECON_AND_STORE(dest, in28); 1.3884 + RECON_AND_STORE(dest, in29); 1.3885 + RECON_AND_STORE(dest, in30); 1.3886 + RECON_AND_STORE(dest, in31); 1.3887 + 1.3888 + dest += 8 - (stride * 32); 1.3889 + } 1.3890 + } 1.3891 +} //NOLINT 1.3892 + 1.3893 +void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) { 1.3894 + __m128i dc_value; 1.3895 + const __m128i zero = _mm_setzero_si128(); 1.3896 + int a, i; 1.3897 + 1.3898 + a = dct_const_round_shift(input[0] * cospi_16_64); 1.3899 + a = dct_const_round_shift(a * cospi_16_64); 1.3900 + a = ROUND_POWER_OF_TWO(a, 6); 1.3901 + 1.3902 + dc_value = _mm_set1_epi16(a); 1.3903 + 1.3904 + for (i = 0; i < 4; ++i) { 1.3905 + RECON_AND_STORE(dest, dc_value); 1.3906 + RECON_AND_STORE(dest, dc_value); 1.3907 + RECON_AND_STORE(dest, dc_value); 1.3908 + RECON_AND_STORE(dest, dc_value); 1.3909 + RECON_AND_STORE(dest, dc_value); 1.3910 + RECON_AND_STORE(dest, dc_value); 1.3911 + RECON_AND_STORE(dest, dc_value); 1.3912 + RECON_AND_STORE(dest, dc_value); 1.3913 + RECON_AND_STORE(dest, dc_value); 1.3914 + RECON_AND_STORE(dest, dc_value); 1.3915 + RECON_AND_STORE(dest, dc_value); 1.3916 + RECON_AND_STORE(dest, dc_value); 1.3917 + RECON_AND_STORE(dest, dc_value); 1.3918 + RECON_AND_STORE(dest, dc_value); 1.3919 + RECON_AND_STORE(dest, dc_value); 1.3920 + RECON_AND_STORE(dest, dc_value); 1.3921 + RECON_AND_STORE(dest, dc_value); 1.3922 + RECON_AND_STORE(dest, dc_value); 1.3923 
+ RECON_AND_STORE(dest, dc_value); 1.3924 + RECON_AND_STORE(dest, dc_value); 1.3925 + RECON_AND_STORE(dest, dc_value); 1.3926 + RECON_AND_STORE(dest, dc_value); 1.3927 + RECON_AND_STORE(dest, dc_value); 1.3928 + RECON_AND_STORE(dest, dc_value); 1.3929 + RECON_AND_STORE(dest, dc_value); 1.3930 + RECON_AND_STORE(dest, dc_value); 1.3931 + RECON_AND_STORE(dest, dc_value); 1.3932 + RECON_AND_STORE(dest, dc_value); 1.3933 + RECON_AND_STORE(dest, dc_value); 1.3934 + RECON_AND_STORE(dest, dc_value); 1.3935 + RECON_AND_STORE(dest, dc_value); 1.3936 + RECON_AND_STORE(dest, dc_value); 1.3937 + dest += 8 - (stride * 32); 1.3938 + } 1.3939 +}
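
/* For reference, a scalar sketch of the DC-only path implemented by
 * vp9_idct32x32_1_add_sse2 above (clip_pixel() comes from vp9_common.h;
 * r and c are illustrative loop variables):
 *
 *   int a = dct_const_round_shift(input[0] * cospi_16_64);
 *   a = dct_const_round_shift(a * cospi_16_64);
 *   a = ROUND_POWER_OF_TWO(a, 6);
 *   for (r = 0; r < 32; ++r)
 *     for (c = 0; c < 32; ++c)
 *       dest[r * stride + c] = clip_pixel(dest[r * stride + c] + a);
 */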