1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c Wed Dec 31 06:09:35 2014 +0100
1.3 @@ -0,0 +1,2688 @@
1.4 +/*
1.5 + * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
1.6 + *
1.7 + * Use of this source code is governed by a BSD-style license
1.8 + * that can be found in the LICENSE file in the root of the source
1.9 + * tree. An additional intellectual property rights grant can be found
1.10 + * in the file PATENTS. All contributing project authors may
1.11 + * be found in the AUTHORS file in the root of the source tree.
1.12 + */
1.13 +
1.14 +#include <emmintrin.h> // SSE2
1.15 +#include "vp9/common/vp9_idct.h" // for cospi constants
1.16 +#include "vpx_ports/mem.h"
1.17 +// k_madd_epi32 / k_packs_epi64: 32-bit lane helpers, compiled only when FDCT32x32_HIGH_PRECISION is non-zero.
1.18 +#if FDCT32x32_HIGH_PRECISION
1.19 +static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {  // per 64-bit lane: a.lo32 * b.lo32 + a.hi32 * b.hi32, returned as two 64-bit sums
1.20 +  __m128i buf0, buf1;
1.21 +  buf0 = _mm_mul_epu32(a, b);  // unsigned 32x32 -> 64 multiply of the low dword of each 64-bit lane
1.22 +  a = _mm_srli_epi64(a, 32);  // bring the high dword of each lane of a down to the low position
1.23 +  b = _mm_srli_epi64(b, 32);  // same for b, so the second multiply sees the odd dwords
1.24 +  buf1 = _mm_mul_epu32(a, b);  // unsigned product of the (former) high dwords
1.25 +  return _mm_add_epi64(buf0, buf1);  // NOTE(review): multiplies are unsigned; with signed inputs only the low 32 bits of each sum are sign-agnostic -- presumably callers keep just those (cf. k_packs_epi64); confirm
1.26 +}
1.27 +// k_packs_epi64: returns { a.lane0.lo32, a.lane1.lo32, b.lane0.lo32, b.lane1.lo32 }; plain truncation, no saturation.
1.28 +static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
1.29 +  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));  // dwords 0 and 2 of a (low half of each 64-bit lane) moved into the low 64 bits
1.30 +  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));  // same selection for b; the upper 64 bits of buf0/buf1 are not used below
1.31 +  return _mm_unpacklo_epi64(buf0, buf1);  // low 64 bits of buf0 followed by low 64 bits of buf1
1.32 +}
1.33 +#endif
1.34 +
1.35 +void FDCT32x32_2D(const int16_t *input,
1.36 +                  int16_t *output_org, int stride) {
1.37 +  // Calculate pre-multiplied strides
1.38 +  const int str1 = stride;
1.39 +  const int str2 = 2 * stride;
1.40 +  const int str3 = 2 * stride + str1;
1.41 +  // We need an intermediate buffer between passes.
1.42 +  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
1.43 +  // Constants
1.44 +  // When we use them, in one case, they are all the same. In all others
1.45 +  // it's a pair of them that we need to repeat four times. This is done
1.46 +  // by constructing the 32 bit constant corresponding to that pair.
1.47 + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64); 1.48 + const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); 1.49 + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); 1.50 + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); 1.51 + const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64); 1.52 + const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64); 1.53 + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); 1.54 + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); 1.55 + const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64); 1.56 + const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64); 1.57 + const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64); 1.58 + const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64); 1.59 + const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64); 1.60 + const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64); 1.61 + const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64); 1.62 + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); 1.63 + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); 1.64 + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); 1.65 + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); 1.66 + const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64); 1.67 + const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64); 1.68 + const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64); 1.69 + const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64); 1.70 + const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64); 1.71 + const __m128i 
k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64); 1.72 + const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64); 1.73 + const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64); 1.74 + const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64); 1.75 + const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64); 1.76 + const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64); 1.77 + const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64); 1.78 + const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64); 1.79 + const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64); 1.80 + const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64); 1.81 + const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64); 1.82 + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 1.83 + const __m128i kZero = _mm_set1_epi16(0); 1.84 + const __m128i kOne = _mm_set1_epi16(1); 1.85 + // Do the two transform/transpose passes 1.86 + int pass; 1.87 + for (pass = 0; pass < 2; ++pass) { 1.88 + // We process eight columns (transposed rows in second pass) at a time. 1.89 + int column_start; 1.90 + for (column_start = 0; column_start < 32; column_start += 8) { 1.91 + __m128i step1[32]; 1.92 + __m128i step2[32]; 1.93 + __m128i step3[32]; 1.94 + __m128i out[32]; 1.95 + // Stage 1 1.96 + // Note: even though all the loads below are aligned, using the aligned 1.97 + // intrinsic make the code slightly slower. 1.98 + if (0 == pass) { 1.99 + const int16_t *in = &input[column_start]; 1.100 + // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2; 1.101 + // Note: the next four blocks could be in a loop. That would help the 1.102 + // instruction cache but is actually slower. 
1.103 + { 1.104 + const int16_t *ina = in + 0 * str1; 1.105 + const int16_t *inb = in + 31 * str1; 1.106 + __m128i *step1a = &step1[ 0]; 1.107 + __m128i *step1b = &step1[31]; 1.108 + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 1.109 + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 1.110 + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 1.111 + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 1.112 + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 1.113 + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 1.114 + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 1.115 + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 1.116 + step1a[ 0] = _mm_add_epi16(ina0, inb0); 1.117 + step1a[ 1] = _mm_add_epi16(ina1, inb1); 1.118 + step1a[ 2] = _mm_add_epi16(ina2, inb2); 1.119 + step1a[ 3] = _mm_add_epi16(ina3, inb3); 1.120 + step1b[-3] = _mm_sub_epi16(ina3, inb3); 1.121 + step1b[-2] = _mm_sub_epi16(ina2, inb2); 1.122 + step1b[-1] = _mm_sub_epi16(ina1, inb1); 1.123 + step1b[-0] = _mm_sub_epi16(ina0, inb0); 1.124 + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 1.125 + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 1.126 + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 1.127 + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 1.128 + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 1.129 + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 1.130 + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 1.131 + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 1.132 + } 1.133 + { 1.134 + const int16_t *ina = in + 4 * str1; 1.135 + const int16_t *inb = in + 27 * str1; 1.136 + __m128i *step1a = &step1[ 4]; 1.137 + __m128i *step1b = &step1[27]; 1.138 + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 1.139 + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 1.140 + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 1.141 + const 
__m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 1.142 + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 1.143 + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 1.144 + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 1.145 + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 1.146 + step1a[ 0] = _mm_add_epi16(ina0, inb0); 1.147 + step1a[ 1] = _mm_add_epi16(ina1, inb1); 1.148 + step1a[ 2] = _mm_add_epi16(ina2, inb2); 1.149 + step1a[ 3] = _mm_add_epi16(ina3, inb3); 1.150 + step1b[-3] = _mm_sub_epi16(ina3, inb3); 1.151 + step1b[-2] = _mm_sub_epi16(ina2, inb2); 1.152 + step1b[-1] = _mm_sub_epi16(ina1, inb1); 1.153 + step1b[-0] = _mm_sub_epi16(ina0, inb0); 1.154 + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 1.155 + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 1.156 + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 1.157 + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 1.158 + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 1.159 + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 1.160 + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 1.161 + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 1.162 + } 1.163 + { 1.164 + const int16_t *ina = in + 8 * str1; 1.165 + const int16_t *inb = in + 23 * str1; 1.166 + __m128i *step1a = &step1[ 8]; 1.167 + __m128i *step1b = &step1[23]; 1.168 + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 1.169 + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 1.170 + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 1.171 + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 1.172 + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 1.173 + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 1.174 + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 1.175 + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 1.176 + step1a[ 0] = _mm_add_epi16(ina0, inb0); 
1.177 + step1a[ 1] = _mm_add_epi16(ina1, inb1); 1.178 + step1a[ 2] = _mm_add_epi16(ina2, inb2); 1.179 + step1a[ 3] = _mm_add_epi16(ina3, inb3); 1.180 + step1b[-3] = _mm_sub_epi16(ina3, inb3); 1.181 + step1b[-2] = _mm_sub_epi16(ina2, inb2); 1.182 + step1b[-1] = _mm_sub_epi16(ina1, inb1); 1.183 + step1b[-0] = _mm_sub_epi16(ina0, inb0); 1.184 + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 1.185 + step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2); 1.186 + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 1.187 + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 1.188 + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 1.189 + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 1.190 + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 1.191 + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 1.192 + } 1.193 + { 1.194 + const int16_t *ina = in + 12 * str1; 1.195 + const int16_t *inb = in + 19 * str1; 1.196 + __m128i *step1a = &step1[12]; 1.197 + __m128i *step1b = &step1[19]; 1.198 + const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina)); 1.199 + const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1)); 1.200 + const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2)); 1.201 + const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3)); 1.202 + const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3)); 1.203 + const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2)); 1.204 + const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1)); 1.205 + const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb)); 1.206 + step1a[ 0] = _mm_add_epi16(ina0, inb0); 1.207 + step1a[ 1] = _mm_add_epi16(ina1, inb1); 1.208 + step1a[ 2] = _mm_add_epi16(ina2, inb2); 1.209 + step1a[ 3] = _mm_add_epi16(ina3, inb3); 1.210 + step1b[-3] = _mm_sub_epi16(ina3, inb3); 1.211 + step1b[-2] = _mm_sub_epi16(ina2, inb2); 1.212 + step1b[-1] = _mm_sub_epi16(ina1, inb1); 1.213 + step1b[-0] = _mm_sub_epi16(ina0, inb0); 1.214 + step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2); 1.215 + step1a[ 1] = 
_mm_slli_epi16(step1a[ 1], 2); 1.216 + step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2); 1.217 + step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2); 1.218 + step1b[-3] = _mm_slli_epi16(step1b[-3], 2); 1.219 + step1b[-2] = _mm_slli_epi16(step1b[-2], 2); 1.220 + step1b[-1] = _mm_slli_epi16(step1b[-1], 2); 1.221 + step1b[-0] = _mm_slli_epi16(step1b[-0], 2); 1.222 + } 1.223 + } else { 1.224 + int16_t *in = &intermediate[column_start]; 1.225 + // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32]; 1.226 + // Note: using the same approach as above to have common offset is 1.227 + // counter-productive as all offsets can be calculated at compile 1.228 + // time. 1.229 + // Note: the next four blocks could be in a loop. That would help the 1.230 + // instruction cache but is actually slower. 1.231 + { 1.232 + __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32)); 1.233 + __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32)); 1.234 + __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32)); 1.235 + __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32)); 1.236 + __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32)); 1.237 + __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32)); 1.238 + __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32)); 1.239 + __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32)); 1.240 + step1[ 0] = _mm_add_epi16(in00, in31); 1.241 + step1[ 1] = _mm_add_epi16(in01, in30); 1.242 + step1[ 2] = _mm_add_epi16(in02, in29); 1.243 + step1[ 3] = _mm_add_epi16(in03, in28); 1.244 + step1[28] = _mm_sub_epi16(in03, in28); 1.245 + step1[29] = _mm_sub_epi16(in02, in29); 1.246 + step1[30] = _mm_sub_epi16(in01, in30); 1.247 + step1[31] = _mm_sub_epi16(in00, in31); 1.248 + } 1.249 + { 1.250 + __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32)); 1.251 + __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32)); 1.252 + __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32)); 1.253 + __m128i in07 = 
_mm_loadu_si128((const __m128i *)(in + 7 * 32)); 1.254 + __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32)); 1.255 + __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32)); 1.256 + __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32)); 1.257 + __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32)); 1.258 + step1[ 4] = _mm_add_epi16(in04, in27); 1.259 + step1[ 5] = _mm_add_epi16(in05, in26); 1.260 + step1[ 6] = _mm_add_epi16(in06, in25); 1.261 + step1[ 7] = _mm_add_epi16(in07, in24); 1.262 + step1[24] = _mm_sub_epi16(in07, in24); 1.263 + step1[25] = _mm_sub_epi16(in06, in25); 1.264 + step1[26] = _mm_sub_epi16(in05, in26); 1.265 + step1[27] = _mm_sub_epi16(in04, in27); 1.266 + } 1.267 + { 1.268 + __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32)); 1.269 + __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32)); 1.270 + __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32)); 1.271 + __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32)); 1.272 + __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32)); 1.273 + __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32)); 1.274 + __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32)); 1.275 + __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32)); 1.276 + step1[ 8] = _mm_add_epi16(in08, in23); 1.277 + step1[ 9] = _mm_add_epi16(in09, in22); 1.278 + step1[10] = _mm_add_epi16(in10, in21); 1.279 + step1[11] = _mm_add_epi16(in11, in20); 1.280 + step1[20] = _mm_sub_epi16(in11, in20); 1.281 + step1[21] = _mm_sub_epi16(in10, in21); 1.282 + step1[22] = _mm_sub_epi16(in09, in22); 1.283 + step1[23] = _mm_sub_epi16(in08, in23); 1.284 + } 1.285 + { 1.286 + __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32)); 1.287 + __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32)); 1.288 + __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32)); 1.289 + __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 
* 32)); 1.290 + __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32)); 1.291 + __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32)); 1.292 + __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32)); 1.293 + __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32)); 1.294 + step1[12] = _mm_add_epi16(in12, in19); 1.295 + step1[13] = _mm_add_epi16(in13, in18); 1.296 + step1[14] = _mm_add_epi16(in14, in17); 1.297 + step1[15] = _mm_add_epi16(in15, in16); 1.298 + step1[16] = _mm_sub_epi16(in15, in16); 1.299 + step1[17] = _mm_sub_epi16(in14, in17); 1.300 + step1[18] = _mm_sub_epi16(in13, in18); 1.301 + step1[19] = _mm_sub_epi16(in12, in19); 1.302 + } 1.303 + } 1.304 + // Stage 2 1.305 + { 1.306 + step2[ 0] = _mm_add_epi16(step1[0], step1[15]); 1.307 + step2[ 1] = _mm_add_epi16(step1[1], step1[14]); 1.308 + step2[ 2] = _mm_add_epi16(step1[2], step1[13]); 1.309 + step2[ 3] = _mm_add_epi16(step1[3], step1[12]); 1.310 + step2[ 4] = _mm_add_epi16(step1[4], step1[11]); 1.311 + step2[ 5] = _mm_add_epi16(step1[5], step1[10]); 1.312 + step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]); 1.313 + step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]); 1.314 + step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]); 1.315 + step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]); 1.316 + step2[10] = _mm_sub_epi16(step1[5], step1[10]); 1.317 + step2[11] = _mm_sub_epi16(step1[4], step1[11]); 1.318 + step2[12] = _mm_sub_epi16(step1[3], step1[12]); 1.319 + step2[13] = _mm_sub_epi16(step1[2], step1[13]); 1.320 + step2[14] = _mm_sub_epi16(step1[1], step1[14]); 1.321 + step2[15] = _mm_sub_epi16(step1[0], step1[15]); 1.322 + } 1.323 + { 1.324 + const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]); 1.325 + const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]); 1.326 + const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]); 1.327 + const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]); 1.328 + const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], 
step1[22]); 1.329 + const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]); 1.330 + const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]); 1.331 + const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]); 1.332 + const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16); 1.333 + const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16); 1.334 + const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16); 1.335 + const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16); 1.336 + const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16); 1.337 + const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16); 1.338 + const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16); 1.339 + const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16); 1.340 + const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16); 1.341 + const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16); 1.342 + const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16); 1.343 + const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16); 1.344 + const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16); 1.345 + const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16); 1.346 + const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16); 1.347 + const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16); 1.348 + // dct_const_round_shift 1.349 + const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING); 1.350 + const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING); 1.351 + const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING); 1.352 + const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING); 1.353 + const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING); 1.354 + const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING); 1.355 + const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, 
k__DCT_CONST_ROUNDING); 1.356 + const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING); 1.357 + const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING); 1.358 + const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING); 1.359 + const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING); 1.360 + const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING); 1.361 + const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING); 1.362 + const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING); 1.363 + const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING); 1.364 + const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING); 1.365 + const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS); 1.366 + const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS); 1.367 + const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS); 1.368 + const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS); 1.369 + const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS); 1.370 + const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS); 1.371 + const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS); 1.372 + const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS); 1.373 + const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS); 1.374 + const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS); 1.375 + const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS); 1.376 + const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS); 1.377 + const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS); 1.378 + const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS); 1.379 + const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS); 1.380 + const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS); 1.381 + // Combine 1.382 + step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7); 1.383 + step2[21] = 
_mm_packs_epi32(s2_21_6, s2_21_7); 1.384 + step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7); 1.385 + step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7); 1.386 + step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7); 1.387 + step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7); 1.388 + step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7); 1.389 + step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7); 1.390 + } 1.391 + 1.392 +#if !FDCT32x32_HIGH_PRECISION 1.393 + // dump the magnitude by half, hence the intermediate values are within 1.394 + // the range of 16 bits. 1.395 + if (1 == pass) { 1.396 + __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero); 1.397 + __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero); 1.398 + __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero); 1.399 + __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero); 1.400 + __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero); 1.401 + __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero); 1.402 + __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero); 1.403 + __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero); 1.404 + __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero); 1.405 + __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero); 1.406 + __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero); 1.407 + __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero); 1.408 + __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero); 1.409 + __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero); 1.410 + __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero); 1.411 + __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero); 1.412 + __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero); 1.413 + __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero); 1.414 + __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero); 1.415 + __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero); 1.416 + __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero); 1.417 + __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero); 1.418 + __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero); 1.419 + __m128i s3_23_0 = 
_mm_cmplt_epi16(step2[23], kZero); 1.420 + __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero); 1.421 + __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero); 1.422 + __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero); 1.423 + __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero); 1.424 + __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero); 1.425 + __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero); 1.426 + __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero); 1.427 + __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero); 1.428 + 1.429 + step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0); 1.430 + step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0); 1.431 + step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0); 1.432 + step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0); 1.433 + step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0); 1.434 + step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0); 1.435 + step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0); 1.436 + step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0); 1.437 + step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0); 1.438 + step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0); 1.439 + step2[10] = _mm_sub_epi16(step2[10], s3_10_0); 1.440 + step2[11] = _mm_sub_epi16(step2[11], s3_11_0); 1.441 + step2[12] = _mm_sub_epi16(step2[12], s3_12_0); 1.442 + step2[13] = _mm_sub_epi16(step2[13], s3_13_0); 1.443 + step2[14] = _mm_sub_epi16(step2[14], s2_14_0); 1.444 + step2[15] = _mm_sub_epi16(step2[15], s2_15_0); 1.445 + step1[16] = _mm_sub_epi16(step1[16], s3_16_0); 1.446 + step1[17] = _mm_sub_epi16(step1[17], s3_17_0); 1.447 + step1[18] = _mm_sub_epi16(step1[18], s3_18_0); 1.448 + step1[19] = _mm_sub_epi16(step1[19], s3_19_0); 1.449 + step2[20] = _mm_sub_epi16(step2[20], s3_20_0); 1.450 + step2[21] = _mm_sub_epi16(step2[21], s3_21_0); 1.451 + step2[22] = _mm_sub_epi16(step2[22], s3_22_0); 1.452 + step2[23] = _mm_sub_epi16(step2[23], s3_23_0); 1.453 + step2[24] = _mm_sub_epi16(step2[24], s3_24_0); 1.454 + step2[25] = _mm_sub_epi16(step2[25], s3_25_0); 1.455 + step2[26] = 
_mm_sub_epi16(step2[26], s3_26_0); 1.456 + step2[27] = _mm_sub_epi16(step2[27], s3_27_0); 1.457 + step1[28] = _mm_sub_epi16(step1[28], s3_28_0); 1.458 + step1[29] = _mm_sub_epi16(step1[29], s3_29_0); 1.459 + step1[30] = _mm_sub_epi16(step1[30], s3_30_0); 1.460 + step1[31] = _mm_sub_epi16(step1[31], s3_31_0); 1.461 + 1.462 + step2[ 0] = _mm_add_epi16(step2[ 0], kOne); 1.463 + step2[ 1] = _mm_add_epi16(step2[ 1], kOne); 1.464 + step2[ 2] = _mm_add_epi16(step2[ 2], kOne); 1.465 + step2[ 3] = _mm_add_epi16(step2[ 3], kOne); 1.466 + step2[ 4] = _mm_add_epi16(step2[ 4], kOne); 1.467 + step2[ 5] = _mm_add_epi16(step2[ 5], kOne); 1.468 + step2[ 6] = _mm_add_epi16(step2[ 6], kOne); 1.469 + step2[ 7] = _mm_add_epi16(step2[ 7], kOne); 1.470 + step2[ 8] = _mm_add_epi16(step2[ 8], kOne); 1.471 + step2[ 9] = _mm_add_epi16(step2[ 9], kOne); 1.472 + step2[10] = _mm_add_epi16(step2[10], kOne); 1.473 + step2[11] = _mm_add_epi16(step2[11], kOne); 1.474 + step2[12] = _mm_add_epi16(step2[12], kOne); 1.475 + step2[13] = _mm_add_epi16(step2[13], kOne); 1.476 + step2[14] = _mm_add_epi16(step2[14], kOne); 1.477 + step2[15] = _mm_add_epi16(step2[15], kOne); 1.478 + step1[16] = _mm_add_epi16(step1[16], kOne); 1.479 + step1[17] = _mm_add_epi16(step1[17], kOne); 1.480 + step1[18] = _mm_add_epi16(step1[18], kOne); 1.481 + step1[19] = _mm_add_epi16(step1[19], kOne); 1.482 + step2[20] = _mm_add_epi16(step2[20], kOne); 1.483 + step2[21] = _mm_add_epi16(step2[21], kOne); 1.484 + step2[22] = _mm_add_epi16(step2[22], kOne); 1.485 + step2[23] = _mm_add_epi16(step2[23], kOne); 1.486 + step2[24] = _mm_add_epi16(step2[24], kOne); 1.487 + step2[25] = _mm_add_epi16(step2[25], kOne); 1.488 + step2[26] = _mm_add_epi16(step2[26], kOne); 1.489 + step2[27] = _mm_add_epi16(step2[27], kOne); 1.490 + step1[28] = _mm_add_epi16(step1[28], kOne); 1.491 + step1[29] = _mm_add_epi16(step1[29], kOne); 1.492 + step1[30] = _mm_add_epi16(step1[30], kOne); 1.493 + step1[31] = _mm_add_epi16(step1[31], kOne); 1.494 + 1.495 + 
step2[ 0] = _mm_srai_epi16(step2[ 0], 2); 1.496 + step2[ 1] = _mm_srai_epi16(step2[ 1], 2); 1.497 + step2[ 2] = _mm_srai_epi16(step2[ 2], 2); 1.498 + step2[ 3] = _mm_srai_epi16(step2[ 3], 2); 1.499 + step2[ 4] = _mm_srai_epi16(step2[ 4], 2); 1.500 + step2[ 5] = _mm_srai_epi16(step2[ 5], 2); 1.501 + step2[ 6] = _mm_srai_epi16(step2[ 6], 2); 1.502 + step2[ 7] = _mm_srai_epi16(step2[ 7], 2); 1.503 + step2[ 8] = _mm_srai_epi16(step2[ 8], 2); 1.504 + step2[ 9] = _mm_srai_epi16(step2[ 9], 2); 1.505 + step2[10] = _mm_srai_epi16(step2[10], 2); 1.506 + step2[11] = _mm_srai_epi16(step2[11], 2); 1.507 + step2[12] = _mm_srai_epi16(step2[12], 2); 1.508 + step2[13] = _mm_srai_epi16(step2[13], 2); 1.509 + step2[14] = _mm_srai_epi16(step2[14], 2); 1.510 + step2[15] = _mm_srai_epi16(step2[15], 2); 1.511 + step1[16] = _mm_srai_epi16(step1[16], 2); 1.512 + step1[17] = _mm_srai_epi16(step1[17], 2); 1.513 + step1[18] = _mm_srai_epi16(step1[18], 2); 1.514 + step1[19] = _mm_srai_epi16(step1[19], 2); 1.515 + step2[20] = _mm_srai_epi16(step2[20], 2); 1.516 + step2[21] = _mm_srai_epi16(step2[21], 2); 1.517 + step2[22] = _mm_srai_epi16(step2[22], 2); 1.518 + step2[23] = _mm_srai_epi16(step2[23], 2); 1.519 + step2[24] = _mm_srai_epi16(step2[24], 2); 1.520 + step2[25] = _mm_srai_epi16(step2[25], 2); 1.521 + step2[26] = _mm_srai_epi16(step2[26], 2); 1.522 + step2[27] = _mm_srai_epi16(step2[27], 2); 1.523 + step1[28] = _mm_srai_epi16(step1[28], 2); 1.524 + step1[29] = _mm_srai_epi16(step1[29], 2); 1.525 + step1[30] = _mm_srai_epi16(step1[30], 2); 1.526 + step1[31] = _mm_srai_epi16(step1[31], 2); 1.527 + } 1.528 +#endif 1.529 + 1.530 +#if FDCT32x32_HIGH_PRECISION 1.531 + if (pass == 0) { 1.532 +#endif 1.533 + // Stage 3 1.534 + { 1.535 + step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); 1.536 + step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); 1.537 + step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); 1.538 + step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); 1.539 + step3[4] = 
_mm_sub_epi16(step2[(8 - 5)], step2[4]); 1.540 + step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); 1.541 + step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); 1.542 + step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); 1.543 + } 1.544 + { 1.545 + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); 1.546 + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); 1.547 + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); 1.548 + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); 1.549 + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); 1.550 + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); 1.551 + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); 1.552 + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); 1.553 + const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); 1.554 + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); 1.555 + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); 1.556 + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); 1.557 + // dct_const_round_shift 1.558 + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); 1.559 + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); 1.560 + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); 1.561 + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); 1.562 + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); 1.563 + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); 1.564 + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); 1.565 + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); 1.566 + const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); 1.567 + const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 1.568 + const __m128i s3_11_6 = 
_mm_srai_epi32(s3_11_4, DCT_CONST_BITS); 1.569 + const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); 1.570 + const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); 1.571 + const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); 1.572 + const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); 1.573 + const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); 1.574 + // Combine 1.575 + step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); 1.576 + step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); 1.577 + step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); 1.578 + step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); 1.579 + } 1.580 + { 1.581 + step3[16] = _mm_add_epi16(step2[23], step1[16]); 1.582 + step3[17] = _mm_add_epi16(step2[22], step1[17]); 1.583 + step3[18] = _mm_add_epi16(step2[21], step1[18]); 1.584 + step3[19] = _mm_add_epi16(step2[20], step1[19]); 1.585 + step3[20] = _mm_sub_epi16(step1[19], step2[20]); 1.586 + step3[21] = _mm_sub_epi16(step1[18], step2[21]); 1.587 + step3[22] = _mm_sub_epi16(step1[17], step2[22]); 1.588 + step3[23] = _mm_sub_epi16(step1[16], step2[23]); 1.589 + step3[24] = _mm_sub_epi16(step1[31], step2[24]); 1.590 + step3[25] = _mm_sub_epi16(step1[30], step2[25]); 1.591 + step3[26] = _mm_sub_epi16(step1[29], step2[26]); 1.592 + step3[27] = _mm_sub_epi16(step1[28], step2[27]); 1.593 + step3[28] = _mm_add_epi16(step2[27], step1[28]); 1.594 + step3[29] = _mm_add_epi16(step2[26], step1[29]); 1.595 + step3[30] = _mm_add_epi16(step2[25], step1[30]); 1.596 + step3[31] = _mm_add_epi16(step2[24], step1[31]); 1.597 + } 1.598 + 1.599 + // Stage 4 1.600 + { 1.601 + step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); 1.602 + step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); 1.603 + step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); 1.604 + step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); 1.605 + step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); 1.606 + step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); 1.607 + step1[10] = 
_mm_sub_epi16(step2[ 9], step3[10]); 1.608 + step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); 1.609 + step1[12] = _mm_sub_epi16(step2[15], step3[12]); 1.610 + step1[13] = _mm_sub_epi16(step2[14], step3[13]); 1.611 + step1[14] = _mm_add_epi16(step3[13], step2[14]); 1.612 + step1[15] = _mm_add_epi16(step3[12], step2[15]); 1.613 + } 1.614 + { 1.615 + const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); 1.616 + const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); 1.617 + const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); 1.618 + const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); 1.619 + const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); 1.620 + const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); 1.621 + // dct_const_round_shift 1.622 + const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); 1.623 + const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); 1.624 + const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); 1.625 + const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); 1.626 + const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); 1.627 + const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); 1.628 + const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); 1.629 + const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); 1.630 + // Combine 1.631 + step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); 1.632 + step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); 1.633 + } 1.634 + { 1.635 + const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); 1.636 + const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); 1.637 + const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); 1.638 + const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); 1.639 + const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); 1.640 + const __m128i s1_20_1 = 
_mm_unpackhi_epi16(step3[20], step3[27]); 1.641 + const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); 1.642 + const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); 1.643 + const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); 1.644 + const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); 1.645 + const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); 1.646 + const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); 1.647 + const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); 1.648 + const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); 1.649 + const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); 1.650 + const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); 1.651 + const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); 1.652 + const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); 1.653 + const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); 1.654 + const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); 1.655 + const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); 1.656 + const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); 1.657 + const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); 1.658 + const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); 1.659 + // dct_const_round_shift 1.660 + const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); 1.661 + const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); 1.662 + const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); 1.663 + const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); 1.664 + const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); 1.665 + const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); 1.666 + const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); 1.667 + const __m128i 
s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); 1.668 + const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); 1.669 + const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); 1.670 + const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); 1.671 + const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); 1.672 + const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); 1.673 + const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); 1.674 + const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); 1.675 + const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); 1.676 + const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); 1.677 + const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); 1.678 + const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); 1.679 + const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); 1.680 + const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); 1.681 + const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); 1.682 + const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); 1.683 + const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); 1.684 + const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); 1.685 + const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); 1.686 + const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); 1.687 + const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); 1.688 + const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); 1.689 + const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); 1.690 + const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); 1.691 + const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); 1.692 + // Combine 1.693 + step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); 1.694 + step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); 1.695 + 
step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); 1.696 + step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); 1.697 + step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); 1.698 + step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); 1.699 + step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); 1.700 + step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); 1.701 + } 1.702 + // Stage 5 1.703 + { 1.704 + step2[4] = _mm_add_epi16(step1[5], step3[4]); 1.705 + step2[5] = _mm_sub_epi16(step3[4], step1[5]); 1.706 + step2[6] = _mm_sub_epi16(step3[7], step1[6]); 1.707 + step2[7] = _mm_add_epi16(step1[6], step3[7]); 1.708 + } 1.709 + { 1.710 + const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); 1.711 + const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); 1.712 + const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); 1.713 + const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); 1.714 + const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); 1.715 + const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); 1.716 + const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); 1.717 + const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); 1.718 + const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); 1.719 + const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); 1.720 + const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); 1.721 + const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); 1.722 + // dct_const_round_shift 1.723 + const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); 1.724 + const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); 1.725 + const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); 1.726 + const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); 1.727 + const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); 1.728 + const __m128i out_08_5 = _mm_add_epi32(out_08_3, 
k__DCT_CONST_ROUNDING); 1.729 + const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); 1.730 + const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); 1.731 + const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); 1.732 + const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); 1.733 + const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); 1.734 + const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); 1.735 + const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); 1.736 + const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); 1.737 + const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); 1.738 + const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); 1.739 + // Combine 1.740 + out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); 1.741 + out[16] = _mm_packs_epi32(out_16_6, out_16_7); 1.742 + out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); 1.743 + out[24] = _mm_packs_epi32(out_24_6, out_24_7); 1.744 + } 1.745 + { 1.746 + const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); 1.747 + const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); 1.748 + const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); 1.749 + const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); 1.750 + const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); 1.751 + const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); 1.752 + const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); 1.753 + const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); 1.754 + const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); 1.755 + const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); 1.756 + const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); 1.757 + const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); 1.758 + // dct_const_round_shift 1.759 + const __m128i 
s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); 1.760 + const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); 1.761 + const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); 1.762 + const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); 1.763 + const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); 1.764 + const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); 1.765 + const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); 1.766 + const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); 1.767 + const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); 1.768 + const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); 1.769 + const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); 1.770 + const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); 1.771 + const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); 1.772 + const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); 1.773 + const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); 1.774 + const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); 1.775 + // Combine 1.776 + step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); 1.777 + step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); 1.778 + step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); 1.779 + step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); 1.780 + } 1.781 + { 1.782 + step2[16] = _mm_add_epi16(step1[19], step3[16]); 1.783 + step2[17] = _mm_add_epi16(step1[18], step3[17]); 1.784 + step2[18] = _mm_sub_epi16(step3[17], step1[18]); 1.785 + step2[19] = _mm_sub_epi16(step3[16], step1[19]); 1.786 + step2[20] = _mm_sub_epi16(step3[23], step1[20]); 1.787 + step2[21] = _mm_sub_epi16(step3[22], step1[21]); 1.788 + step2[22] = _mm_add_epi16(step1[21], step3[22]); 1.789 + step2[23] = _mm_add_epi16(step1[20], step3[23]); 1.790 + step2[24] = _mm_add_epi16(step1[27], step3[24]); 1.791 + step2[25] = 
_mm_add_epi16(step1[26], step3[25]); 1.792 + step2[26] = _mm_sub_epi16(step3[25], step1[26]); 1.793 + step2[27] = _mm_sub_epi16(step3[24], step1[27]); 1.794 + step2[28] = _mm_sub_epi16(step3[31], step1[28]); 1.795 + step2[29] = _mm_sub_epi16(step3[30], step1[29]); 1.796 + step2[30] = _mm_add_epi16(step1[29], step3[30]); 1.797 + step2[31] = _mm_add_epi16(step1[28], step3[31]); 1.798 + } 1.799 + // Stage 6 1.800 + { 1.801 + const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); 1.802 + const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); 1.803 + const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); 1.804 + const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); 1.805 + const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); 1.806 + const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); 1.807 + const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); 1.808 + const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); 1.809 + const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); 1.810 + const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); 1.811 + const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); 1.812 + const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); 1.813 + const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); 1.814 + const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); 1.815 + const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); 1.816 + const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); 1.817 + // dct_const_round_shift 1.818 + const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); 1.819 + const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); 1.820 + const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); 1.821 + const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); 1.822 + const __m128i 
out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); 1.823 + const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); 1.824 + const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); 1.825 + const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); 1.826 + const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); 1.827 + const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); 1.828 + const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); 1.829 + const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); 1.830 + const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); 1.831 + const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); 1.832 + const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); 1.833 + const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); 1.834 + // Combine 1.835 + out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); 1.836 + out[20] = _mm_packs_epi32(out_20_6, out_20_7); 1.837 + out[12] = _mm_packs_epi32(out_12_6, out_12_7); 1.838 + out[28] = _mm_packs_epi32(out_28_6, out_28_7); 1.839 + } 1.840 + { 1.841 + step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); 1.842 + step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); 1.843 + step3[10] = _mm_sub_epi16(step1[11], step2[10]); 1.844 + step3[11] = _mm_add_epi16(step2[10], step1[11]); 1.845 + step3[12] = _mm_add_epi16(step2[13], step1[12]); 1.846 + step3[13] = _mm_sub_epi16(step1[12], step2[13]); 1.847 + step3[14] = _mm_sub_epi16(step1[15], step2[14]); 1.848 + step3[15] = _mm_add_epi16(step2[14], step1[15]); 1.849 + } 1.850 + { 1.851 + const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); 1.852 + const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); 1.853 + const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); 1.854 + const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); 1.855 + const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], 
step2[26]); 1.856 + const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); 1.857 + const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); 1.858 + const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); 1.859 + const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); 1.860 + const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); 1.861 + const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); 1.862 + const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); 1.863 + const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); 1.864 + const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); 1.865 + const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); 1.866 + const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); 1.867 + const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); 1.868 + const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); 1.869 + const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); 1.870 + const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); 1.871 + const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); 1.872 + const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); 1.873 + const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); 1.874 + const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); 1.875 + // dct_const_round_shift 1.876 + const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); 1.877 + const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); 1.878 + const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); 1.879 + const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); 1.880 + const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); 1.881 + const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); 1.882 + const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, 
k__DCT_CONST_ROUNDING); 1.883 + const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); 1.884 + const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); 1.885 + const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); 1.886 + const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); 1.887 + const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); 1.888 + const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); 1.889 + const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); 1.890 + const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); 1.891 + const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); 1.892 + const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); 1.893 + const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); 1.894 + const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); 1.895 + const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); 1.896 + const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); 1.897 + const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); 1.898 + const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); 1.899 + const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); 1.900 + const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); 1.901 + const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); 1.902 + const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); 1.903 + const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); 1.904 + const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); 1.905 + const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); 1.906 + const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); 1.907 + const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); 1.908 + // Combine 1.909 + step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); 1.910 + step3[18] = 
_mm_packs_epi32(s3_18_6, s3_18_7); 1.911 + step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); 1.912 + step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); 1.913 + // Combine 1.914 + step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); 1.915 + step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); 1.916 + step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); 1.917 + step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); 1.918 + } 1.919 + // Stage 7 1.920 + { 1.921 + const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); 1.922 + const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); 1.923 + const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); 1.924 + const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); 1.925 + const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); 1.926 + const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); 1.927 + const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); 1.928 + const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); 1.929 + const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); 1.930 + const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); 1.931 + const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); 1.932 + const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); 1.933 + const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); 1.934 + const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); 1.935 + const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); 1.936 + const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); 1.937 + const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); 1.938 + const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); 1.939 + const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); 1.940 + const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); 1.941 + const __m128i out_14_2 = 
_mm_madd_epi16(out_18_0, k__cospi_m18_p14); 1.942 + const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); 1.943 + const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); 1.944 + const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); 1.945 + // dct_const_round_shift 1.946 + const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); 1.947 + const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); 1.948 + const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); 1.949 + const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); 1.950 + const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); 1.951 + const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); 1.952 + const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); 1.953 + const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); 1.954 + const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); 1.955 + const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); 1.956 + const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); 1.957 + const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); 1.958 + const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); 1.959 + const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); 1.960 + const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); 1.961 + const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); 1.962 + const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); 1.963 + const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); 1.964 + const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); 1.965 + const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); 1.966 + const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); 1.967 + const 
__m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); 1.968 + const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); 1.969 + const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); 1.970 + const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); 1.971 + const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); 1.972 + const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); 1.973 + const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); 1.974 + const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); 1.975 + const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); 1.976 + const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); 1.977 + const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); 1.978 + // Combine 1.979 + out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); 1.980 + out[18] = _mm_packs_epi32(out_18_6, out_18_7); 1.981 + out[10] = _mm_packs_epi32(out_10_6, out_10_7); 1.982 + out[26] = _mm_packs_epi32(out_26_6, out_26_7); 1.983 + out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); 1.984 + out[22] = _mm_packs_epi32(out_22_6, out_22_7); 1.985 + out[14] = _mm_packs_epi32(out_14_6, out_14_7); 1.986 + out[30] = _mm_packs_epi32(out_30_6, out_30_7); 1.987 + } 1.988 + { 1.989 + step1[16] = _mm_add_epi16(step3[17], step2[16]); 1.990 + step1[17] = _mm_sub_epi16(step2[16], step3[17]); 1.991 + step1[18] = _mm_sub_epi16(step2[19], step3[18]); 1.992 + step1[19] = _mm_add_epi16(step3[18], step2[19]); 1.993 + step1[20] = _mm_add_epi16(step3[21], step2[20]); 1.994 + step1[21] = _mm_sub_epi16(step2[20], step3[21]); 1.995 + step1[22] = _mm_sub_epi16(step2[23], step3[22]); 1.996 + step1[23] = _mm_add_epi16(step3[22], step2[23]); 1.997 + step1[24] = _mm_add_epi16(step3[25], step2[24]); 1.998 + step1[25] = _mm_sub_epi16(step2[24], step3[25]); 1.999 + step1[26] = _mm_sub_epi16(step2[27], step3[26]); 1.1000 + step1[27] = _mm_add_epi16(step3[26], step2[27]); 1.1001 + step1[28] = 
_mm_add_epi16(step3[29], step2[28]); 1.1002 + step1[29] = _mm_sub_epi16(step2[28], step3[29]); 1.1003 + step1[30] = _mm_sub_epi16(step2[31], step3[30]); 1.1004 + step1[31] = _mm_add_epi16(step3[30], step2[31]); 1.1005 + } 1.1006 + // Final stage --- outputs indices are bit-reversed. 1.1007 + { 1.1008 + const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); 1.1009 + const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); 1.1010 + const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); 1.1011 + const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); 1.1012 + const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); 1.1013 + const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); 1.1014 + const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); 1.1015 + const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); 1.1016 + const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); 1.1017 + const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); 1.1018 + const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); 1.1019 + const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); 1.1020 + const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); 1.1021 + const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); 1.1022 + const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); 1.1023 + const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); 1.1024 + const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); 1.1025 + const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); 1.1026 + const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); 1.1027 + const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); 1.1028 + const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); 1.1029 + const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); 
1.1030 + const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); 1.1031 + const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); 1.1032 + // dct_const_round_shift 1.1033 + const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); 1.1034 + const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); 1.1035 + const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); 1.1036 + const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); 1.1037 + const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); 1.1038 + const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); 1.1039 + const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); 1.1040 + const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); 1.1041 + const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); 1.1042 + const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); 1.1043 + const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); 1.1044 + const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); 1.1045 + const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); 1.1046 + const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); 1.1047 + const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); 1.1048 + const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); 1.1049 + const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); 1.1050 + const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); 1.1051 + const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); 1.1052 + const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); 1.1053 + const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); 1.1054 + const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); 1.1055 + const __m128i out_25_6 = 
_mm_srai_epi32(out_25_4, DCT_CONST_BITS); 1.1056 + const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); 1.1057 + const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); 1.1058 + const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); 1.1059 + const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); 1.1060 + const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); 1.1061 + const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); 1.1062 + const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); 1.1063 + const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); 1.1064 + const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); 1.1065 + // Combine 1.1066 + out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); 1.1067 + out[17] = _mm_packs_epi32(out_17_6, out_17_7); 1.1068 + out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); 1.1069 + out[25] = _mm_packs_epi32(out_25_6, out_25_7); 1.1070 + out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); 1.1071 + out[23] = _mm_packs_epi32(out_23_6, out_23_7); 1.1072 + out[15] = _mm_packs_epi32(out_15_6, out_15_7); 1.1073 + out[31] = _mm_packs_epi32(out_31_6, out_31_7); 1.1074 + } 1.1075 + { 1.1076 + const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); 1.1077 + const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); 1.1078 + const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); 1.1079 + const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); 1.1080 + const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); 1.1081 + const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); 1.1082 + const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); 1.1083 + const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); 1.1084 + const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); 1.1085 + const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); 1.1086 + const 
__m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); 1.1087 + const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); 1.1088 + const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); 1.1089 + const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); 1.1090 + const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); 1.1091 + const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); 1.1092 + const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); 1.1093 + const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); 1.1094 + const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); 1.1095 + const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); 1.1096 + const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); 1.1097 + const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); 1.1098 + const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); 1.1099 + const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); 1.1100 + // dct_const_round_shift 1.1101 + const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); 1.1102 + const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); 1.1103 + const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); 1.1104 + const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); 1.1105 + const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); 1.1106 + const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); 1.1107 + const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); 1.1108 + const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); 1.1109 + const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); 1.1110 + const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); 1.1111 + const __m128i out_19_4 = _mm_add_epi32(out_19_2, 
k__DCT_CONST_ROUNDING); 1.1112 + const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); 1.1113 + const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); 1.1114 + const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); 1.1115 + const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); 1.1116 + const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); 1.1117 + const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); 1.1118 + const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); 1.1119 + const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); 1.1120 + const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); 1.1121 + const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); 1.1122 + const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); 1.1123 + const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); 1.1124 + const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); 1.1125 + const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); 1.1126 + const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); 1.1127 + const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); 1.1128 + const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); 1.1129 + const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); 1.1130 + const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); 1.1131 + const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); 1.1132 + const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); 1.1133 + // Combine 1.1134 + out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); 1.1135 + out[21] = _mm_packs_epi32(out_21_6, out_21_7); 1.1136 + out[13] = _mm_packs_epi32(out_13_6, out_13_7); 1.1137 + out[29] = _mm_packs_epi32(out_29_6, out_29_7); 1.1138 + out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); 1.1139 + out[19] = _mm_packs_epi32(out_19_6, 
out_19_7); 1.1140 + out[11] = _mm_packs_epi32(out_11_6, out_11_7); 1.1141 + out[27] = _mm_packs_epi32(out_27_6, out_27_7); 1.1142 + } 1.1143 +#if FDCT32x32_HIGH_PRECISION 1.1144 + } else { 1.1145 + __m128i lstep1[64], lstep2[64], lstep3[64]; 1.1146 + __m128i u[32], v[32], sign[16]; 1.1147 + const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); 1.1148 + // start using 32-bit operations 1.1149 + // stage 3 1.1150 + { 1.1151 + // expanding to 32-bit length priori to addition operations 1.1152 + lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero); 1.1153 + lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero); 1.1154 + lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero); 1.1155 + lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero); 1.1156 + lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero); 1.1157 + lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero); 1.1158 + lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero); 1.1159 + lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero); 1.1160 + lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero); 1.1161 + lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero); 1.1162 + lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero); 1.1163 + lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero); 1.1164 + lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero); 1.1165 + lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero); 1.1166 + lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero); 1.1167 + lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero); 1.1168 + lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne); 1.1169 + lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne); 1.1170 + lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne); 1.1171 + lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne); 1.1172 + lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne); 1.1173 + lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne); 1.1174 + lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne); 1.1175 + lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne); 1.1176 + lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne); 1.1177 + lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], 
kOne); 1.1178 + lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); 1.1179 + lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); 1.1180 + lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); 1.1181 + lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); 1.1182 + lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); 1.1183 + lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); 1.1184 + 1.1185 + lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); 1.1186 + lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); 1.1187 + lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); 1.1188 + lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); 1.1189 + lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); 1.1190 + lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); 1.1191 + lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); 1.1192 + lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); 1.1193 + lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); 1.1194 + lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); 1.1195 + lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); 1.1196 + lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); 1.1197 + lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); 1.1198 + lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); 1.1199 + lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); 1.1200 + lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); 1.1201 + } 1.1202 + { 1.1203 + const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); 1.1204 + const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); 1.1205 + const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); 1.1206 + const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); 1.1207 + const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); 1.1208 + const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); 1.1209 + const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); 1.1210 + const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); 1.1211 + const __m128i s3_12_2 = 
_mm_madd_epi16(s3_11_0, k__cospi_p16_p16); 1.1212 + const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); 1.1213 + const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); 1.1214 + const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); 1.1215 + // dct_const_round_shift 1.1216 + const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); 1.1217 + const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); 1.1218 + const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); 1.1219 + const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); 1.1220 + const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); 1.1221 + const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); 1.1222 + const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); 1.1223 + const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); 1.1224 + lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); 1.1225 + lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); 1.1226 + lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); 1.1227 + lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); 1.1228 + lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); 1.1229 + lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); 1.1230 + lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); 1.1231 + lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); 1.1232 + } 1.1233 + { 1.1234 + lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); 1.1235 + lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); 1.1236 + lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); 1.1237 + lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); 1.1238 + lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); 1.1239 + lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); 1.1240 + lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); 1.1241 + lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); 1.1242 + lstep2[48] = 
_mm_unpacklo_epi16(step2[24], kZero); 1.1243 + lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); 1.1244 + lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); 1.1245 + lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); 1.1246 + lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); 1.1247 + lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); 1.1248 + lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); 1.1249 + lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); 1.1250 + lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); 1.1251 + lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); 1.1252 + lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); 1.1253 + lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); 1.1254 + lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); 1.1255 + lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); 1.1256 + lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); 1.1257 + lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); 1.1258 + lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); 1.1259 + lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); 1.1260 + lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); 1.1261 + lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); 1.1262 + lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); 1.1263 + lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); 1.1264 + lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); 1.1265 + lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); 1.1266 + 1.1267 + lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); 1.1268 + lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); 1.1269 + lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); 1.1270 + lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); 1.1271 + lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); 1.1272 + lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); 1.1273 + lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); 1.1274 + lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); 1.1275 + lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); 1.1276 + lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); 1.1277 + lstep1[58] = 
_mm_unpacklo_epi16(step1[29], kZero); 1.1278 + lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); 1.1279 + lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); 1.1280 + lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); 1.1281 + lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); 1.1282 + lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); 1.1283 + lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); 1.1284 + lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); 1.1285 + lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); 1.1286 + lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); 1.1287 + lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); 1.1288 + lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); 1.1289 + lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); 1.1290 + lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); 1.1291 + lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); 1.1292 + lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); 1.1293 + lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); 1.1294 + lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); 1.1295 + lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); 1.1296 + lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); 1.1297 + lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); 1.1298 + lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); 1.1299 + 1.1300 + lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); 1.1301 + lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); 1.1302 + 1.1303 + lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); 1.1304 + lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); 1.1305 + lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); 1.1306 + lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); 1.1307 + lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); 1.1308 + lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); 1.1309 + lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); 1.1310 + lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); 1.1311 + lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); 1.1312 + lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); 1.1313 + 
lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); 1.1314 + lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); 1.1315 + lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); 1.1316 + lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); 1.1317 + lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); 1.1318 + lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); 1.1319 + lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); 1.1320 + lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); 1.1321 + lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); 1.1322 + lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); 1.1323 + lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); 1.1324 + lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); 1.1325 + lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); 1.1326 + lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); 1.1327 + lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); 1.1328 + lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); 1.1329 + lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); 1.1330 + lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); 1.1331 + lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); 1.1332 + lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); 1.1333 + } 1.1334 + 1.1335 + // stage 4 1.1336 + { 1.1337 + // expanding to 32-bit length priori to addition operations 1.1338 + lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero); 1.1339 + lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero); 1.1340 + lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero); 1.1341 + lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero); 1.1342 + lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); 1.1343 + lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); 1.1344 + lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); 1.1345 + lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); 1.1346 + lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); 1.1347 + lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); 1.1348 + lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); 1.1349 + lstep2[19] 
= _mm_madd_epi16(lstep2[19], kOne); 1.1350 + lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); 1.1351 + lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); 1.1352 + lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); 1.1353 + lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); 1.1354 + 1.1355 + lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); 1.1356 + lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); 1.1357 + lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); 1.1358 + lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); 1.1359 + lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); 1.1360 + lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); 1.1361 + lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); 1.1362 + lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); 1.1363 + lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); 1.1364 + lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); 1.1365 + lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); 1.1366 + lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); 1.1367 + lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); 1.1368 + lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); 1.1369 + lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); 1.1370 + lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); 1.1371 + lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); 1.1372 + lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); 1.1373 + lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); 1.1374 + lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); 1.1375 + lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); 1.1376 + lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); 1.1377 + lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); 1.1378 + lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); 1.1379 + } 1.1380 + { 1.1381 + // to be continued... 
1.1382 + // 1.1383 + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); 1.1384 + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); 1.1385 + 1.1386 + u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); 1.1387 + u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); 1.1388 + u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); 1.1389 + u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); 1.1390 + 1.1391 + // TODO(jingning): manually inline k_madd_epi32_ to further hide 1.1392 + // instruction latency. 1.1393 + v[ 0] = k_madd_epi32(u[0], k32_p16_m16); 1.1394 + v[ 1] = k_madd_epi32(u[1], k32_p16_m16); 1.1395 + v[ 2] = k_madd_epi32(u[2], k32_p16_m16); 1.1396 + v[ 3] = k_madd_epi32(u[3], k32_p16_m16); 1.1397 + v[ 4] = k_madd_epi32(u[0], k32_p16_p16); 1.1398 + v[ 5] = k_madd_epi32(u[1], k32_p16_p16); 1.1399 + v[ 6] = k_madd_epi32(u[2], k32_p16_p16); 1.1400 + v[ 7] = k_madd_epi32(u[3], k32_p16_p16); 1.1401 + 1.1402 + u[0] = k_packs_epi64(v[0], v[1]); 1.1403 + u[1] = k_packs_epi64(v[2], v[3]); 1.1404 + u[2] = k_packs_epi64(v[4], v[5]); 1.1405 + u[3] = k_packs_epi64(v[6], v[7]); 1.1406 + 1.1407 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1408 + v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1409 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1410 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1411 + 1.1412 + lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.1413 + lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.1414 + lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.1415 + lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.1416 + } 1.1417 + { 1.1418 + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); 1.1419 + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); 1.1420 + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); 1.1421 + 1.1422 + u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); 1.1423 + u[ 1] = _mm_unpackhi_epi32(lstep3[36], 
lstep3[58]); 1.1424 + u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); 1.1425 + u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); 1.1426 + u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); 1.1427 + u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); 1.1428 + u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); 1.1429 + u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); 1.1430 + u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); 1.1431 + u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); 1.1432 + u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); 1.1433 + u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); 1.1434 + u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); 1.1435 + u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); 1.1436 + u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); 1.1437 + u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); 1.1438 + 1.1439 + v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); 1.1440 + v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); 1.1441 + v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); 1.1442 + v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); 1.1443 + v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); 1.1444 + v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); 1.1445 + v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); 1.1446 + v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); 1.1447 + v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); 1.1448 + v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); 1.1449 + v[10] = k_madd_epi32(u[10], k32_m24_m08); 1.1450 + v[11] = k_madd_epi32(u[11], k32_m24_m08); 1.1451 + v[12] = k_madd_epi32(u[12], k32_m24_m08); 1.1452 + v[13] = k_madd_epi32(u[13], k32_m24_m08); 1.1453 + v[14] = k_madd_epi32(u[14], k32_m24_m08); 1.1454 + v[15] = k_madd_epi32(u[15], k32_m24_m08); 1.1455 + v[16] = k_madd_epi32(u[12], k32_m08_p24); 1.1456 + v[17] = k_madd_epi32(u[13], k32_m08_p24); 1.1457 + v[18] = k_madd_epi32(u[14], k32_m08_p24); 1.1458 + v[19] = k_madd_epi32(u[15], k32_m08_p24); 1.1459 + v[20] = k_madd_epi32(u[ 8], k32_m08_p24); 1.1460 + v[21] = k_madd_epi32(u[ 9], k32_m08_p24); 
1.1461 + v[22] = k_madd_epi32(u[10], k32_m08_p24); 1.1462 + v[23] = k_madd_epi32(u[11], k32_m08_p24); 1.1463 + v[24] = k_madd_epi32(u[ 4], k32_p24_p08); 1.1464 + v[25] = k_madd_epi32(u[ 5], k32_p24_p08); 1.1465 + v[26] = k_madd_epi32(u[ 6], k32_p24_p08); 1.1466 + v[27] = k_madd_epi32(u[ 7], k32_p24_p08); 1.1467 + v[28] = k_madd_epi32(u[ 0], k32_p24_p08); 1.1468 + v[29] = k_madd_epi32(u[ 1], k32_p24_p08); 1.1469 + v[30] = k_madd_epi32(u[ 2], k32_p24_p08); 1.1470 + v[31] = k_madd_epi32(u[ 3], k32_p24_p08); 1.1471 + 1.1472 + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); 1.1473 + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); 1.1474 + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); 1.1475 + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); 1.1476 + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); 1.1477 + u[ 5] = k_packs_epi64(v[10], v[11]); 1.1478 + u[ 6] = k_packs_epi64(v[12], v[13]); 1.1479 + u[ 7] = k_packs_epi64(v[14], v[15]); 1.1480 + u[ 8] = k_packs_epi64(v[16], v[17]); 1.1481 + u[ 9] = k_packs_epi64(v[18], v[19]); 1.1482 + u[10] = k_packs_epi64(v[20], v[21]); 1.1483 + u[11] = k_packs_epi64(v[22], v[23]); 1.1484 + u[12] = k_packs_epi64(v[24], v[25]); 1.1485 + u[13] = k_packs_epi64(v[26], v[27]); 1.1486 + u[14] = k_packs_epi64(v[28], v[29]); 1.1487 + u[15] = k_packs_epi64(v[30], v[31]); 1.1488 + 1.1489 + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); 1.1490 + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); 1.1491 + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); 1.1492 + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); 1.1493 + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); 1.1494 + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); 1.1495 + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); 1.1496 + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); 1.1497 + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); 1.1498 + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); 1.1499 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.1500 + v[11] = _mm_add_epi32(u[11], 
k__DCT_CONST_ROUNDING); 1.1501 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.1502 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.1503 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.1504 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.1505 + 1.1506 + lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); 1.1507 + lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); 1.1508 + lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); 1.1509 + lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); 1.1510 + lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); 1.1511 + lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); 1.1512 + lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); 1.1513 + lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); 1.1514 + lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); 1.1515 + lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); 1.1516 + lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.1517 + lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.1518 + lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.1519 + lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.1520 + lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.1521 + lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.1522 + } 1.1523 + // stage 5 1.1524 + { 1.1525 + lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); 1.1526 + lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); 1.1527 + lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); 1.1528 + lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]); 1.1529 + lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); 1.1530 + lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); 1.1531 + lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); 1.1532 + lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); 1.1533 + } 1.1534 + { 1.1535 + const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); 1.1536 + const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); 1.1537 + const __m128i 
k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); 1.1538 + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); 1.1539 + 1.1540 + u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); 1.1541 + u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); 1.1542 + u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); 1.1543 + u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); 1.1544 + u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); 1.1545 + u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); 1.1546 + u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); 1.1547 + u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); 1.1548 + 1.1549 + // TODO(jingning): manually inline k_madd_epi32_ to further hide 1.1550 + // instruction latency. 1.1551 + v[ 0] = k_madd_epi32(u[0], k32_p16_p16); 1.1552 + v[ 1] = k_madd_epi32(u[1], k32_p16_p16); 1.1553 + v[ 2] = k_madd_epi32(u[2], k32_p16_p16); 1.1554 + v[ 3] = k_madd_epi32(u[3], k32_p16_p16); 1.1555 + v[ 4] = k_madd_epi32(u[0], k32_p16_m16); 1.1556 + v[ 5] = k_madd_epi32(u[1], k32_p16_m16); 1.1557 + v[ 6] = k_madd_epi32(u[2], k32_p16_m16); 1.1558 + v[ 7] = k_madd_epi32(u[3], k32_p16_m16); 1.1559 + v[ 8] = k_madd_epi32(u[4], k32_p24_p08); 1.1560 + v[ 9] = k_madd_epi32(u[5], k32_p24_p08); 1.1561 + v[10] = k_madd_epi32(u[6], k32_p24_p08); 1.1562 + v[11] = k_madd_epi32(u[7], k32_p24_p08); 1.1563 + v[12] = k_madd_epi32(u[4], k32_m08_p24); 1.1564 + v[13] = k_madd_epi32(u[5], k32_m08_p24); 1.1565 + v[14] = k_madd_epi32(u[6], k32_m08_p24); 1.1566 + v[15] = k_madd_epi32(u[7], k32_m08_p24); 1.1567 + 1.1568 + u[0] = k_packs_epi64(v[0], v[1]); 1.1569 + u[1] = k_packs_epi64(v[2], v[3]); 1.1570 + u[2] = k_packs_epi64(v[4], v[5]); 1.1571 + u[3] = k_packs_epi64(v[6], v[7]); 1.1572 + u[4] = k_packs_epi64(v[8], v[9]); 1.1573 + u[5] = k_packs_epi64(v[10], v[11]); 1.1574 + u[6] = k_packs_epi64(v[12], v[13]); 1.1575 + u[7] = k_packs_epi64(v[14], v[15]); 1.1576 + 1.1577 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1578 + v[1] = _mm_add_epi32(u[1], 
k__DCT_CONST_ROUNDING); 1.1579 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1580 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1581 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1582 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1583 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1584 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1585 + 1.1586 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.1587 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.1588 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.1589 + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.1590 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.1591 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.1592 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.1593 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.1594 + 1.1595 + sign[0] = _mm_cmplt_epi32(u[0], kZero); 1.1596 + sign[1] = _mm_cmplt_epi32(u[1], kZero); 1.1597 + sign[2] = _mm_cmplt_epi32(u[2], kZero); 1.1598 + sign[3] = _mm_cmplt_epi32(u[3], kZero); 1.1599 + sign[4] = _mm_cmplt_epi32(u[4], kZero); 1.1600 + sign[5] = _mm_cmplt_epi32(u[5], kZero); 1.1601 + sign[6] = _mm_cmplt_epi32(u[6], kZero); 1.1602 + sign[7] = _mm_cmplt_epi32(u[7], kZero); 1.1603 + 1.1604 + u[0] = _mm_sub_epi32(u[0], sign[0]); 1.1605 + u[1] = _mm_sub_epi32(u[1], sign[1]); 1.1606 + u[2] = _mm_sub_epi32(u[2], sign[2]); 1.1607 + u[3] = _mm_sub_epi32(u[3], sign[3]); 1.1608 + u[4] = _mm_sub_epi32(u[4], sign[4]); 1.1609 + u[5] = _mm_sub_epi32(u[5], sign[5]); 1.1610 + u[6] = _mm_sub_epi32(u[6], sign[6]); 1.1611 + u[7] = _mm_sub_epi32(u[7], sign[7]); 1.1612 + 1.1613 + u[0] = _mm_add_epi32(u[0], K32One); 1.1614 + u[1] = _mm_add_epi32(u[1], K32One); 1.1615 + u[2] = _mm_add_epi32(u[2], K32One); 1.1616 + u[3] = _mm_add_epi32(u[3], K32One); 1.1617 + u[4] = _mm_add_epi32(u[4], K32One); 1.1618 + u[5] = _mm_add_epi32(u[5], K32One); 1.1619 + u[6] = _mm_add_epi32(u[6], K32One); 1.1620 + u[7] = _mm_add_epi32(u[7], K32One); 1.1621 + 1.1622 + u[0] = 
_mm_srai_epi32(u[0], 2); 1.1623 + u[1] = _mm_srai_epi32(u[1], 2); 1.1624 + u[2] = _mm_srai_epi32(u[2], 2); 1.1625 + u[3] = _mm_srai_epi32(u[3], 2); 1.1626 + u[4] = _mm_srai_epi32(u[4], 2); 1.1627 + u[5] = _mm_srai_epi32(u[5], 2); 1.1628 + u[6] = _mm_srai_epi32(u[6], 2); 1.1629 + u[7] = _mm_srai_epi32(u[7], 2); 1.1630 + 1.1631 + // Combine 1.1632 + out[ 0] = _mm_packs_epi32(u[0], u[1]); 1.1633 + out[16] = _mm_packs_epi32(u[2], u[3]); 1.1634 + out[ 8] = _mm_packs_epi32(u[4], u[5]); 1.1635 + out[24] = _mm_packs_epi32(u[6], u[7]); 1.1636 + } 1.1637 + { 1.1638 + const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); 1.1639 + const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); 1.1640 + const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); 1.1641 + 1.1642 + u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); 1.1643 + u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); 1.1644 + u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); 1.1645 + u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); 1.1646 + u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); 1.1647 + u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); 1.1648 + u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); 1.1649 + u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); 1.1650 + 1.1651 + v[0] = k_madd_epi32(u[0], k32_m08_p24); 1.1652 + v[1] = k_madd_epi32(u[1], k32_m08_p24); 1.1653 + v[2] = k_madd_epi32(u[2], k32_m08_p24); 1.1654 + v[3] = k_madd_epi32(u[3], k32_m08_p24); 1.1655 + v[4] = k_madd_epi32(u[4], k32_m24_m08); 1.1656 + v[5] = k_madd_epi32(u[5], k32_m24_m08); 1.1657 + v[6] = k_madd_epi32(u[6], k32_m24_m08); 1.1658 + v[7] = k_madd_epi32(u[7], k32_m24_m08); 1.1659 + v[ 8] = k_madd_epi32(u[4], k32_m08_p24); 1.1660 + v[ 9] = k_madd_epi32(u[5], k32_m08_p24); 1.1661 + v[10] = k_madd_epi32(u[6], k32_m08_p24); 1.1662 + v[11] = k_madd_epi32(u[7], k32_m08_p24); 1.1663 + v[12] = k_madd_epi32(u[0], k32_p24_p08); 1.1664 + v[13] = k_madd_epi32(u[1], k32_p24_p08); 1.1665 + v[14] = 
k_madd_epi32(u[2], k32_p24_p08); 1.1666 + v[15] = k_madd_epi32(u[3], k32_p24_p08); 1.1667 + 1.1668 + u[0] = k_packs_epi64(v[0], v[1]); 1.1669 + u[1] = k_packs_epi64(v[2], v[3]); 1.1670 + u[2] = k_packs_epi64(v[4], v[5]); 1.1671 + u[3] = k_packs_epi64(v[6], v[7]); 1.1672 + u[4] = k_packs_epi64(v[8], v[9]); 1.1673 + u[5] = k_packs_epi64(v[10], v[11]); 1.1674 + u[6] = k_packs_epi64(v[12], v[13]); 1.1675 + u[7] = k_packs_epi64(v[14], v[15]); 1.1676 + 1.1677 + u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1678 + u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1679 + u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1680 + u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1681 + u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1682 + u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1683 + u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1684 + u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1685 + 1.1686 + lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); 1.1687 + lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); 1.1688 + lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); 1.1689 + lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); 1.1690 + lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); 1.1691 + lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); 1.1692 + lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); 1.1693 + lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); 1.1694 + } 1.1695 + { 1.1696 + lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); 1.1697 + lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); 1.1698 + lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); 1.1699 + lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); 1.1700 + lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); 1.1701 + lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); 1.1702 + lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); 1.1703 + lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); 1.1704 + lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); 1.1705 + 
lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); 1.1706 + lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); 1.1707 + lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); 1.1708 + lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); 1.1709 + lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); 1.1710 + lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); 1.1711 + lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); 1.1712 + lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); 1.1713 + lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); 1.1714 + lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); 1.1715 + lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); 1.1716 + lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); 1.1717 + lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); 1.1718 + lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); 1.1719 + lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); 1.1720 + lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); 1.1721 + lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); 1.1722 + lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); 1.1723 + lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); 1.1724 + lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); 1.1725 + lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); 1.1726 + lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); 1.1727 + lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); 1.1728 + } 1.1729 + // stage 6 1.1730 + { 1.1731 + const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); 1.1732 + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); 1.1733 + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); 1.1734 + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); 1.1735 + 1.1736 + u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); 1.1737 + u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); 1.1738 + u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); 1.1739 + u[3] = _mm_unpackhi_epi32(lstep2[ 9], 
lstep2[15]); 1.1740 + u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); 1.1741 + u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); 1.1742 + u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); 1.1743 + u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); 1.1744 + u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); 1.1745 + u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); 1.1746 + u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); 1.1747 + u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); 1.1748 + u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); 1.1749 + u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); 1.1750 + u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); 1.1751 + u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); 1.1752 + 1.1753 + v[0] = k_madd_epi32(u[0], k32_p28_p04); 1.1754 + v[1] = k_madd_epi32(u[1], k32_p28_p04); 1.1755 + v[2] = k_madd_epi32(u[2], k32_p28_p04); 1.1756 + v[3] = k_madd_epi32(u[3], k32_p28_p04); 1.1757 + v[4] = k_madd_epi32(u[4], k32_p12_p20); 1.1758 + v[5] = k_madd_epi32(u[5], k32_p12_p20); 1.1759 + v[6] = k_madd_epi32(u[6], k32_p12_p20); 1.1760 + v[7] = k_madd_epi32(u[7], k32_p12_p20); 1.1761 + v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); 1.1762 + v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); 1.1763 + v[10] = k_madd_epi32(u[10], k32_m20_p12); 1.1764 + v[11] = k_madd_epi32(u[11], k32_m20_p12); 1.1765 + v[12] = k_madd_epi32(u[12], k32_m04_p28); 1.1766 + v[13] = k_madd_epi32(u[13], k32_m04_p28); 1.1767 + v[14] = k_madd_epi32(u[14], k32_m04_p28); 1.1768 + v[15] = k_madd_epi32(u[15], k32_m04_p28); 1.1769 + 1.1770 + u[0] = k_packs_epi64(v[0], v[1]); 1.1771 + u[1] = k_packs_epi64(v[2], v[3]); 1.1772 + u[2] = k_packs_epi64(v[4], v[5]); 1.1773 + u[3] = k_packs_epi64(v[6], v[7]); 1.1774 + u[4] = k_packs_epi64(v[8], v[9]); 1.1775 + u[5] = k_packs_epi64(v[10], v[11]); 1.1776 + u[6] = k_packs_epi64(v[12], v[13]); 1.1777 + u[7] = k_packs_epi64(v[14], v[15]); 1.1778 + 1.1779 + v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); 1.1780 + v[1] = 
_mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); 1.1781 + v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); 1.1782 + v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); 1.1783 + v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); 1.1784 + v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); 1.1785 + v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); 1.1786 + v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); 1.1787 + 1.1788 + u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); 1.1789 + u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); 1.1790 + u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); 1.1791 + u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); 1.1792 + u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); 1.1793 + u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); 1.1794 + u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); 1.1795 + u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); 1.1796 + 1.1797 + sign[0] = _mm_cmplt_epi32(u[0], kZero); 1.1798 + sign[1] = _mm_cmplt_epi32(u[1], kZero); 1.1799 + sign[2] = _mm_cmplt_epi32(u[2], kZero); 1.1800 + sign[3] = _mm_cmplt_epi32(u[3], kZero); 1.1801 + sign[4] = _mm_cmplt_epi32(u[4], kZero); 1.1802 + sign[5] = _mm_cmplt_epi32(u[5], kZero); 1.1803 + sign[6] = _mm_cmplt_epi32(u[6], kZero); 1.1804 + sign[7] = _mm_cmplt_epi32(u[7], kZero); 1.1805 + 1.1806 + u[0] = _mm_sub_epi32(u[0], sign[0]); 1.1807 + u[1] = _mm_sub_epi32(u[1], sign[1]); 1.1808 + u[2] = _mm_sub_epi32(u[2], sign[2]); 1.1809 + u[3] = _mm_sub_epi32(u[3], sign[3]); 1.1810 + u[4] = _mm_sub_epi32(u[4], sign[4]); 1.1811 + u[5] = _mm_sub_epi32(u[5], sign[5]); 1.1812 + u[6] = _mm_sub_epi32(u[6], sign[6]); 1.1813 + u[7] = _mm_sub_epi32(u[7], sign[7]); 1.1814 + 1.1815 + u[0] = _mm_add_epi32(u[0], K32One); 1.1816 + u[1] = _mm_add_epi32(u[1], K32One); 1.1817 + u[2] = _mm_add_epi32(u[2], K32One); 1.1818 + u[3] = _mm_add_epi32(u[3], K32One); 1.1819 + u[4] = _mm_add_epi32(u[4], K32One); 1.1820 + u[5] = _mm_add_epi32(u[5], K32One); 1.1821 + u[6] = _mm_add_epi32(u[6], K32One); 1.1822 + u[7] = _mm_add_epi32(u[7], K32One); 1.1823 
+ 1.1824 + u[0] = _mm_srai_epi32(u[0], 2); 1.1825 + u[1] = _mm_srai_epi32(u[1], 2); 1.1826 + u[2] = _mm_srai_epi32(u[2], 2); 1.1827 + u[3] = _mm_srai_epi32(u[3], 2); 1.1828 + u[4] = _mm_srai_epi32(u[4], 2); 1.1829 + u[5] = _mm_srai_epi32(u[5], 2); 1.1830 + u[6] = _mm_srai_epi32(u[6], 2); 1.1831 + u[7] = _mm_srai_epi32(u[7], 2); 1.1832 + 1.1833 + out[ 4] = _mm_packs_epi32(u[0], u[1]); 1.1834 + out[20] = _mm_packs_epi32(u[2], u[3]); 1.1835 + out[12] = _mm_packs_epi32(u[4], u[5]); 1.1836 + out[28] = _mm_packs_epi32(u[6], u[7]); 1.1837 + } 1.1838 + { 1.1839 + lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); 1.1840 + lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); 1.1841 + lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); 1.1842 + lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); 1.1843 + lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); 1.1844 + lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); 1.1845 + lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); 1.1846 + lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); 1.1847 + lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); 1.1848 + lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); 1.1849 + lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); 1.1850 + lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); 1.1851 + lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); 1.1852 + lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); 1.1853 + lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); 1.1854 + lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); 1.1855 + } 1.1856 + { 1.1857 + const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); 1.1858 + const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); 1.1859 + const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); 1.1860 + const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64, 1.1861 + -cospi_20_64); 1.1862 + const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); 1.1863 + const __m128i k32_p28_p04 = 
pair_set_epi32(cospi_28_64, cospi_4_64); 1.1864 + 1.1865 + u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); 1.1866 + u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); 1.1867 + u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); 1.1868 + u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); 1.1869 + u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); 1.1870 + u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); 1.1871 + u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); 1.1872 + u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); 1.1873 + u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); 1.1874 + u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); 1.1875 + u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); 1.1876 + u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); 1.1877 + u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); 1.1878 + u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); 1.1879 + u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); 1.1880 + u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); 1.1881 + 1.1882 + v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28); 1.1883 + v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28); 1.1884 + v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28); 1.1885 + v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28); 1.1886 + v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04); 1.1887 + v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04); 1.1888 + v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04); 1.1889 + v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04); 1.1890 + v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); 1.1891 + v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); 1.1892 + v[10] = k_madd_epi32(u[10], k32_m20_p12); 1.1893 + v[11] = k_madd_epi32(u[11], k32_m20_p12); 1.1894 + v[12] = k_madd_epi32(u[12], k32_m12_m20); 1.1895 + v[13] = k_madd_epi32(u[13], k32_m12_m20); 1.1896 + v[14] = k_madd_epi32(u[14], k32_m12_m20); 1.1897 + v[15] = k_madd_epi32(u[15], k32_m12_m20); 1.1898 + v[16] = k_madd_epi32(u[12], k32_m20_p12); 1.1899 + v[17] = k_madd_epi32(u[13], k32_m20_p12); 1.1900 + v[18] = k_madd_epi32(u[14], 
k32_m20_p12); 1.1901 + v[19] = k_madd_epi32(u[15], k32_m20_p12); 1.1902 + v[20] = k_madd_epi32(u[ 8], k32_p12_p20); 1.1903 + v[21] = k_madd_epi32(u[ 9], k32_p12_p20); 1.1904 + v[22] = k_madd_epi32(u[10], k32_p12_p20); 1.1905 + v[23] = k_madd_epi32(u[11], k32_p12_p20); 1.1906 + v[24] = k_madd_epi32(u[ 4], k32_m04_p28); 1.1907 + v[25] = k_madd_epi32(u[ 5], k32_m04_p28); 1.1908 + v[26] = k_madd_epi32(u[ 6], k32_m04_p28); 1.1909 + v[27] = k_madd_epi32(u[ 7], k32_m04_p28); 1.1910 + v[28] = k_madd_epi32(u[ 0], k32_p28_p04); 1.1911 + v[29] = k_madd_epi32(u[ 1], k32_p28_p04); 1.1912 + v[30] = k_madd_epi32(u[ 2], k32_p28_p04); 1.1913 + v[31] = k_madd_epi32(u[ 3], k32_p28_p04); 1.1914 + 1.1915 + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); 1.1916 + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); 1.1917 + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); 1.1918 + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); 1.1919 + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); 1.1920 + u[ 5] = k_packs_epi64(v[10], v[11]); 1.1921 + u[ 6] = k_packs_epi64(v[12], v[13]); 1.1922 + u[ 7] = k_packs_epi64(v[14], v[15]); 1.1923 + u[ 8] = k_packs_epi64(v[16], v[17]); 1.1924 + u[ 9] = k_packs_epi64(v[18], v[19]); 1.1925 + u[10] = k_packs_epi64(v[20], v[21]); 1.1926 + u[11] = k_packs_epi64(v[22], v[23]); 1.1927 + u[12] = k_packs_epi64(v[24], v[25]); 1.1928 + u[13] = k_packs_epi64(v[26], v[27]); 1.1929 + u[14] = k_packs_epi64(v[28], v[29]); 1.1930 + u[15] = k_packs_epi64(v[30], v[31]); 1.1931 + 1.1932 + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); 1.1933 + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); 1.1934 + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); 1.1935 + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); 1.1936 + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); 1.1937 + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); 1.1938 + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); 1.1939 + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); 1.1940 + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); 1.1941 + 
v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); 1.1942 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.1943 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.1944 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.1945 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.1946 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.1947 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.1948 + 1.1949 + lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); 1.1950 + lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); 1.1951 + lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); 1.1952 + lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); 1.1953 + lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); 1.1954 + lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); 1.1955 + lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); 1.1956 + lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); 1.1957 + lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); 1.1958 + lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); 1.1959 + lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.1960 + lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.1961 + lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.1962 + lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.1963 + lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.1964 + lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.1965 + } 1.1966 + // stage 7 1.1967 + { 1.1968 + const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); 1.1969 + const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); 1.1970 + const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); 1.1971 + const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); 1.1972 + const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); 1.1973 + const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); 1.1974 + const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, 
cospi_14_64); 1.1975 + const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); 1.1976 + 1.1977 + u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); 1.1978 + u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); 1.1979 + u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); 1.1980 + u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); 1.1981 + u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); 1.1982 + u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); 1.1983 + u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); 1.1984 + u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); 1.1985 + u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); 1.1986 + u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); 1.1987 + u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); 1.1988 + u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); 1.1989 + u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); 1.1990 + u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); 1.1991 + u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); 1.1992 + u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); 1.1993 + 1.1994 + v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02); 1.1995 + v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02); 1.1996 + v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02); 1.1997 + v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02); 1.1998 + v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18); 1.1999 + v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18); 1.2000 + v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18); 1.2001 + v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18); 1.2002 + v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10); 1.2003 + v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10); 1.2004 + v[10] = k_madd_epi32(u[10], k32_p22_p10); 1.2005 + v[11] = k_madd_epi32(u[11], k32_p22_p10); 1.2006 + v[12] = k_madd_epi32(u[12], k32_p06_p26); 1.2007 + v[13] = k_madd_epi32(u[13], k32_p06_p26); 1.2008 + v[14] = k_madd_epi32(u[14], k32_p06_p26); 1.2009 + v[15] = k_madd_epi32(u[15], k32_p06_p26); 1.2010 + v[16] = k_madd_epi32(u[12], k32_m26_p06); 1.2011 + v[17] = k_madd_epi32(u[13], 
k32_m26_p06); 1.2012 + v[18] = k_madd_epi32(u[14], k32_m26_p06); 1.2013 + v[19] = k_madd_epi32(u[15], k32_m26_p06); 1.2014 + v[20] = k_madd_epi32(u[ 8], k32_m10_p22); 1.2015 + v[21] = k_madd_epi32(u[ 9], k32_m10_p22); 1.2016 + v[22] = k_madd_epi32(u[10], k32_m10_p22); 1.2017 + v[23] = k_madd_epi32(u[11], k32_m10_p22); 1.2018 + v[24] = k_madd_epi32(u[ 4], k32_m18_p14); 1.2019 + v[25] = k_madd_epi32(u[ 5], k32_m18_p14); 1.2020 + v[26] = k_madd_epi32(u[ 6], k32_m18_p14); 1.2021 + v[27] = k_madd_epi32(u[ 7], k32_m18_p14); 1.2022 + v[28] = k_madd_epi32(u[ 0], k32_m02_p30); 1.2023 + v[29] = k_madd_epi32(u[ 1], k32_m02_p30); 1.2024 + v[30] = k_madd_epi32(u[ 2], k32_m02_p30); 1.2025 + v[31] = k_madd_epi32(u[ 3], k32_m02_p30); 1.2026 + 1.2027 + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); 1.2028 + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); 1.2029 + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); 1.2030 + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); 1.2031 + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); 1.2032 + u[ 5] = k_packs_epi64(v[10], v[11]); 1.2033 + u[ 6] = k_packs_epi64(v[12], v[13]); 1.2034 + u[ 7] = k_packs_epi64(v[14], v[15]); 1.2035 + u[ 8] = k_packs_epi64(v[16], v[17]); 1.2036 + u[ 9] = k_packs_epi64(v[18], v[19]); 1.2037 + u[10] = k_packs_epi64(v[20], v[21]); 1.2038 + u[11] = k_packs_epi64(v[22], v[23]); 1.2039 + u[12] = k_packs_epi64(v[24], v[25]); 1.2040 + u[13] = k_packs_epi64(v[26], v[27]); 1.2041 + u[14] = k_packs_epi64(v[28], v[29]); 1.2042 + u[15] = k_packs_epi64(v[30], v[31]); 1.2043 + 1.2044 + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); 1.2045 + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); 1.2046 + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); 1.2047 + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); 1.2048 + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); 1.2049 + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); 1.2050 + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); 1.2051 + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); 1.2052 + v[ 8] = 
_mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); 1.2053 + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); 1.2054 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.2055 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.2056 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.2057 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.2058 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.2059 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.2060 + 1.2061 + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); 1.2062 + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); 1.2063 + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); 1.2064 + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); 1.2065 + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); 1.2066 + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); 1.2067 + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); 1.2068 + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); 1.2069 + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); 1.2070 + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); 1.2071 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.2072 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.2073 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.2074 + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.2075 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.2076 + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.2077 + 1.2078 + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); 1.2079 + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); 1.2080 + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); 1.2081 + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); 1.2082 + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); 1.2083 + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); 1.2084 + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); 1.2085 + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); 1.2086 + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); 1.2087 + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); 1.2088 + v[10] = _mm_cmplt_epi32(u[10], kZero); 1.2089 + v[11] = _mm_cmplt_epi32(u[11], kZero); 1.2090 + v[12] = 
_mm_cmplt_epi32(u[12], kZero); 1.2091 + v[13] = _mm_cmplt_epi32(u[13], kZero); 1.2092 + v[14] = _mm_cmplt_epi32(u[14], kZero); 1.2093 + v[15] = _mm_cmplt_epi32(u[15], kZero); 1.2094 + 1.2095 + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); 1.2096 + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); 1.2097 + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); 1.2098 + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); 1.2099 + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); 1.2100 + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); 1.2101 + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); 1.2102 + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); 1.2103 + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); 1.2104 + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); 1.2105 + u[10] = _mm_sub_epi32(u[10], v[10]); 1.2106 + u[11] = _mm_sub_epi32(u[11], v[11]); 1.2107 + u[12] = _mm_sub_epi32(u[12], v[12]); 1.2108 + u[13] = _mm_sub_epi32(u[13], v[13]); 1.2109 + u[14] = _mm_sub_epi32(u[14], v[14]); 1.2110 + u[15] = _mm_sub_epi32(u[15], v[15]); 1.2111 + 1.2112 + v[ 0] = _mm_add_epi32(u[ 0], K32One); 1.2113 + v[ 1] = _mm_add_epi32(u[ 1], K32One); 1.2114 + v[ 2] = _mm_add_epi32(u[ 2], K32One); 1.2115 + v[ 3] = _mm_add_epi32(u[ 3], K32One); 1.2116 + v[ 4] = _mm_add_epi32(u[ 4], K32One); 1.2117 + v[ 5] = _mm_add_epi32(u[ 5], K32One); 1.2118 + v[ 6] = _mm_add_epi32(u[ 6], K32One); 1.2119 + v[ 7] = _mm_add_epi32(u[ 7], K32One); 1.2120 + v[ 8] = _mm_add_epi32(u[ 8], K32One); 1.2121 + v[ 9] = _mm_add_epi32(u[ 9], K32One); 1.2122 + v[10] = _mm_add_epi32(u[10], K32One); 1.2123 + v[11] = _mm_add_epi32(u[11], K32One); 1.2124 + v[12] = _mm_add_epi32(u[12], K32One); 1.2125 + v[13] = _mm_add_epi32(u[13], K32One); 1.2126 + v[14] = _mm_add_epi32(u[14], K32One); 1.2127 + v[15] = _mm_add_epi32(u[15], K32One); 1.2128 + 1.2129 + u[ 0] = _mm_srai_epi32(v[ 0], 2); 1.2130 + u[ 1] = _mm_srai_epi32(v[ 1], 2); 1.2131 + u[ 2] = _mm_srai_epi32(v[ 2], 2); 1.2132 + u[ 3] = _mm_srai_epi32(v[ 3], 2); 1.2133 + u[ 4] = _mm_srai_epi32(v[ 4], 2); 1.2134 + u[ 5] = _mm_srai_epi32(v[ 5], 2); 1.2135 + u[ 6] = _mm_srai_epi32(v[ 6], 2); 1.2136 + 
u[ 7] = _mm_srai_epi32(v[ 7], 2); 1.2137 + u[ 8] = _mm_srai_epi32(v[ 8], 2); 1.2138 + u[ 9] = _mm_srai_epi32(v[ 9], 2); 1.2139 + u[10] = _mm_srai_epi32(v[10], 2); 1.2140 + u[11] = _mm_srai_epi32(v[11], 2); 1.2141 + u[12] = _mm_srai_epi32(v[12], 2); 1.2142 + u[13] = _mm_srai_epi32(v[13], 2); 1.2143 + u[14] = _mm_srai_epi32(v[14], 2); 1.2144 + u[15] = _mm_srai_epi32(v[15], 2); 1.2145 + 1.2146 + out[ 2] = _mm_packs_epi32(u[0], u[1]); 1.2147 + out[18] = _mm_packs_epi32(u[2], u[3]); 1.2148 + out[10] = _mm_packs_epi32(u[4], u[5]); 1.2149 + out[26] = _mm_packs_epi32(u[6], u[7]); 1.2150 + out[ 6] = _mm_packs_epi32(u[8], u[9]); 1.2151 + out[22] = _mm_packs_epi32(u[10], u[11]); 1.2152 + out[14] = _mm_packs_epi32(u[12], u[13]); 1.2153 + out[30] = _mm_packs_epi32(u[14], u[15]); 1.2154 + } 1.2155 + { 1.2156 + lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); 1.2157 + lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); 1.2158 + lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); 1.2159 + lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); 1.2160 + lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); 1.2161 + lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); 1.2162 + lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); 1.2163 + lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); 1.2164 + lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); 1.2165 + lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); 1.2166 + lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); 1.2167 + lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); 1.2168 + lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); 1.2169 + lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); 1.2170 + lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); 1.2171 + lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); 1.2172 + lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); 1.2173 + lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); 1.2174 + lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); 1.2175 + lstep1[51] = 
_mm_sub_epi32(lstep2[49], lstep3[51]); 1.2176 + lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); 1.2177 + lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); 1.2178 + lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); 1.2179 + lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); 1.2180 + lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); 1.2181 + lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); 1.2182 + lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); 1.2183 + lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); 1.2184 + lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); 1.2185 + lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); 1.2186 + lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); 1.2187 + lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); 1.2188 + } 1.2189 + // stage 8 1.2190 + { 1.2191 + const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); 1.2192 + const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); 1.2193 + const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); 1.2194 + const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); 1.2195 + const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); 1.2196 + const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); 1.2197 + const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); 1.2198 + const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); 1.2199 + 1.2200 + u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); 1.2201 + u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); 1.2202 + u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); 1.2203 + u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); 1.2204 + u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); 1.2205 + u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); 1.2206 + u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); 1.2207 + u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); 1.2208 + u[ 8] = _mm_unpacklo_epi32(lstep1[36], 
lstep1[58]); 1.2209 + u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); 1.2210 + u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); 1.2211 + u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); 1.2212 + u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); 1.2213 + u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); 1.2214 + u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); 1.2215 + u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); 1.2216 + 1.2217 + v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01); 1.2218 + v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01); 1.2219 + v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01); 1.2220 + v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01); 1.2221 + v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17); 1.2222 + v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17); 1.2223 + v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17); 1.2224 + v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17); 1.2225 + v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09); 1.2226 + v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09); 1.2227 + v[10] = k_madd_epi32(u[10], k32_p23_p09); 1.2228 + v[11] = k_madd_epi32(u[11], k32_p23_p09); 1.2229 + v[12] = k_madd_epi32(u[12], k32_p07_p25); 1.2230 + v[13] = k_madd_epi32(u[13], k32_p07_p25); 1.2231 + v[14] = k_madd_epi32(u[14], k32_p07_p25); 1.2232 + v[15] = k_madd_epi32(u[15], k32_p07_p25); 1.2233 + v[16] = k_madd_epi32(u[12], k32_m25_p07); 1.2234 + v[17] = k_madd_epi32(u[13], k32_m25_p07); 1.2235 + v[18] = k_madd_epi32(u[14], k32_m25_p07); 1.2236 + v[19] = k_madd_epi32(u[15], k32_m25_p07); 1.2237 + v[20] = k_madd_epi32(u[ 8], k32_m09_p23); 1.2238 + v[21] = k_madd_epi32(u[ 9], k32_m09_p23); 1.2239 + v[22] = k_madd_epi32(u[10], k32_m09_p23); 1.2240 + v[23] = k_madd_epi32(u[11], k32_m09_p23); 1.2241 + v[24] = k_madd_epi32(u[ 4], k32_m17_p15); 1.2242 + v[25] = k_madd_epi32(u[ 5], k32_m17_p15); 1.2243 + v[26] = k_madd_epi32(u[ 6], k32_m17_p15); 1.2244 + v[27] = k_madd_epi32(u[ 7], k32_m17_p15); 1.2245 + v[28] = k_madd_epi32(u[ 0], k32_m01_p31); 1.2246 + v[29] = k_madd_epi32(u[ 1], k32_m01_p31); 1.2247 + v[30] = 
k_madd_epi32(u[ 2], k32_m01_p31); 1.2248 + v[31] = k_madd_epi32(u[ 3], k32_m01_p31); 1.2249 + 1.2250 + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); 1.2251 + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); 1.2252 + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); 1.2253 + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); 1.2254 + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); 1.2255 + u[ 5] = k_packs_epi64(v[10], v[11]); 1.2256 + u[ 6] = k_packs_epi64(v[12], v[13]); 1.2257 + u[ 7] = k_packs_epi64(v[14], v[15]); 1.2258 + u[ 8] = k_packs_epi64(v[16], v[17]); 1.2259 + u[ 9] = k_packs_epi64(v[18], v[19]); 1.2260 + u[10] = k_packs_epi64(v[20], v[21]); 1.2261 + u[11] = k_packs_epi64(v[22], v[23]); 1.2262 + u[12] = k_packs_epi64(v[24], v[25]); 1.2263 + u[13] = k_packs_epi64(v[26], v[27]); 1.2264 + u[14] = k_packs_epi64(v[28], v[29]); 1.2265 + u[15] = k_packs_epi64(v[30], v[31]); 1.2266 + 1.2267 + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); 1.2268 + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); 1.2269 + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); 1.2270 + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); 1.2271 + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); 1.2272 + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); 1.2273 + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); 1.2274 + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); 1.2275 + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); 1.2276 + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); 1.2277 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.2278 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.2279 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.2280 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.2281 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.2282 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.2283 + 1.2284 + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); 1.2285 + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); 1.2286 + u[ 2] = _mm_srai_epi32(v[ 2], 
DCT_CONST_BITS); 1.2287 + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); 1.2288 + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); 1.2289 + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); 1.2290 + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); 1.2291 + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); 1.2292 + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); 1.2293 + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); 1.2294 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.2295 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.2296 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.2297 + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.2298 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.2299 + u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); 1.2300 + 1.2301 + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); 1.2302 + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); 1.2303 + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); 1.2304 + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); 1.2305 + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); 1.2306 + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); 1.2307 + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); 1.2308 + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); 1.2309 + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); 1.2310 + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); 1.2311 + v[10] = _mm_cmplt_epi32(u[10], kZero); 1.2312 + v[11] = _mm_cmplt_epi32(u[11], kZero); 1.2313 + v[12] = _mm_cmplt_epi32(u[12], kZero); 1.2314 + v[13] = _mm_cmplt_epi32(u[13], kZero); 1.2315 + v[14] = _mm_cmplt_epi32(u[14], kZero); 1.2316 + v[15] = _mm_cmplt_epi32(u[15], kZero); 1.2317 + 1.2318 + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); 1.2319 + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); 1.2320 + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); 1.2321 + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); 1.2322 + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); 1.2323 + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); 1.2324 + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); 1.2325 + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); 1.2326 + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); 1.2327 + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); 1.2328 + 
u[10] = _mm_sub_epi32(u[10], v[10]); 1.2329 + u[11] = _mm_sub_epi32(u[11], v[11]); 1.2330 + u[12] = _mm_sub_epi32(u[12], v[12]); 1.2331 + u[13] = _mm_sub_epi32(u[13], v[13]); 1.2332 + u[14] = _mm_sub_epi32(u[14], v[14]); 1.2333 + u[15] = _mm_sub_epi32(u[15], v[15]); 1.2334 + 1.2335 + v[0] = _mm_add_epi32(u[0], K32One); 1.2336 + v[1] = _mm_add_epi32(u[1], K32One); 1.2337 + v[2] = _mm_add_epi32(u[2], K32One); 1.2338 + v[3] = _mm_add_epi32(u[3], K32One); 1.2339 + v[4] = _mm_add_epi32(u[4], K32One); 1.2340 + v[5] = _mm_add_epi32(u[5], K32One); 1.2341 + v[6] = _mm_add_epi32(u[6], K32One); 1.2342 + v[7] = _mm_add_epi32(u[7], K32One); 1.2343 + v[8] = _mm_add_epi32(u[8], K32One); 1.2344 + v[9] = _mm_add_epi32(u[9], K32One); 1.2345 + v[10] = _mm_add_epi32(u[10], K32One); 1.2346 + v[11] = _mm_add_epi32(u[11], K32One); 1.2347 + v[12] = _mm_add_epi32(u[12], K32One); 1.2348 + v[13] = _mm_add_epi32(u[13], K32One); 1.2349 + v[14] = _mm_add_epi32(u[14], K32One); 1.2350 + v[15] = _mm_add_epi32(u[15], K32One); 1.2351 + 1.2352 + u[0] = _mm_srai_epi32(v[0], 2); 1.2353 + u[1] = _mm_srai_epi32(v[1], 2); 1.2354 + u[2] = _mm_srai_epi32(v[2], 2); 1.2355 + u[3] = _mm_srai_epi32(v[3], 2); 1.2356 + u[4] = _mm_srai_epi32(v[4], 2); 1.2357 + u[5] = _mm_srai_epi32(v[5], 2); 1.2358 + u[6] = _mm_srai_epi32(v[6], 2); 1.2359 + u[7] = _mm_srai_epi32(v[7], 2); 1.2360 + u[8] = _mm_srai_epi32(v[8], 2); 1.2361 + u[9] = _mm_srai_epi32(v[9], 2); 1.2362 + u[10] = _mm_srai_epi32(v[10], 2); 1.2363 + u[11] = _mm_srai_epi32(v[11], 2); 1.2364 + u[12] = _mm_srai_epi32(v[12], 2); 1.2365 + u[13] = _mm_srai_epi32(v[13], 2); 1.2366 + u[14] = _mm_srai_epi32(v[14], 2); 1.2367 + u[15] = _mm_srai_epi32(v[15], 2); 1.2368 + 1.2369 + out[ 1] = _mm_packs_epi32(u[0], u[1]); 1.2370 + out[17] = _mm_packs_epi32(u[2], u[3]); 1.2371 + out[ 9] = _mm_packs_epi32(u[4], u[5]); 1.2372 + out[25] = _mm_packs_epi32(u[6], u[7]); 1.2373 + out[ 7] = _mm_packs_epi32(u[8], u[9]); 1.2374 + out[23] = _mm_packs_epi32(u[10], u[11]); 1.2375 + 
out[15] = _mm_packs_epi32(u[12], u[13]); 1.2376 + out[31] = _mm_packs_epi32(u[14], u[15]); 1.2377 + } 1.2378 + { 1.2379 + const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); 1.2380 + const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); 1.2381 + const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); 1.2382 + const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); 1.2383 + const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); 1.2384 + const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); 1.2385 + const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); 1.2386 + const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); 1.2387 + 1.2388 + u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); 1.2389 + u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); 1.2390 + u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); 1.2391 + u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); 1.2392 + u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); 1.2393 + u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); 1.2394 + u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); 1.2395 + u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); 1.2396 + u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); 1.2397 + u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); 1.2398 + u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); 1.2399 + u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); 1.2400 + u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); 1.2401 + u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); 1.2402 + u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); 1.2403 + u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); 1.2404 + 1.2405 + v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05); 1.2406 + v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05); 1.2407 + v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05); 1.2408 + v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05); 1.2409 + v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21); 
1.2410 + v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21); 1.2411 + v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21); 1.2412 + v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21); 1.2413 + v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13); 1.2414 + v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13); 1.2415 + v[10] = k_madd_epi32(u[10], k32_p19_p13); 1.2416 + v[11] = k_madd_epi32(u[11], k32_p19_p13); 1.2417 + v[12] = k_madd_epi32(u[12], k32_p03_p29); 1.2418 + v[13] = k_madd_epi32(u[13], k32_p03_p29); 1.2419 + v[14] = k_madd_epi32(u[14], k32_p03_p29); 1.2420 + v[15] = k_madd_epi32(u[15], k32_p03_p29); 1.2421 + v[16] = k_madd_epi32(u[12], k32_m29_p03); 1.2422 + v[17] = k_madd_epi32(u[13], k32_m29_p03); 1.2423 + v[18] = k_madd_epi32(u[14], k32_m29_p03); 1.2424 + v[19] = k_madd_epi32(u[15], k32_m29_p03); 1.2425 + v[20] = k_madd_epi32(u[ 8], k32_m13_p19); 1.2426 + v[21] = k_madd_epi32(u[ 9], k32_m13_p19); 1.2427 + v[22] = k_madd_epi32(u[10], k32_m13_p19); 1.2428 + v[23] = k_madd_epi32(u[11], k32_m13_p19); 1.2429 + v[24] = k_madd_epi32(u[ 4], k32_m21_p11); 1.2430 + v[25] = k_madd_epi32(u[ 5], k32_m21_p11); 1.2431 + v[26] = k_madd_epi32(u[ 6], k32_m21_p11); 1.2432 + v[27] = k_madd_epi32(u[ 7], k32_m21_p11); 1.2433 + v[28] = k_madd_epi32(u[ 0], k32_m05_p27); 1.2434 + v[29] = k_madd_epi32(u[ 1], k32_m05_p27); 1.2435 + v[30] = k_madd_epi32(u[ 2], k32_m05_p27); 1.2436 + v[31] = k_madd_epi32(u[ 3], k32_m05_p27); 1.2437 + 1.2438 + u[ 0] = k_packs_epi64(v[ 0], v[ 1]); 1.2439 + u[ 1] = k_packs_epi64(v[ 2], v[ 3]); 1.2440 + u[ 2] = k_packs_epi64(v[ 4], v[ 5]); 1.2441 + u[ 3] = k_packs_epi64(v[ 6], v[ 7]); 1.2442 + u[ 4] = k_packs_epi64(v[ 8], v[ 9]); 1.2443 + u[ 5] = k_packs_epi64(v[10], v[11]); 1.2444 + u[ 6] = k_packs_epi64(v[12], v[13]); 1.2445 + u[ 7] = k_packs_epi64(v[14], v[15]); 1.2446 + u[ 8] = k_packs_epi64(v[16], v[17]); 1.2447 + u[ 9] = k_packs_epi64(v[18], v[19]); 1.2448 + u[10] = k_packs_epi64(v[20], v[21]); 1.2449 + u[11] = k_packs_epi64(v[22], v[23]); 1.2450 + u[12] = k_packs_epi64(v[24], v[25]); 1.2451 + u[13] 
= k_packs_epi64(v[26], v[27]); 1.2452 + u[14] = k_packs_epi64(v[28], v[29]); 1.2453 + u[15] = k_packs_epi64(v[30], v[31]); 1.2454 + 1.2455 + v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); 1.2456 + v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); 1.2457 + v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); 1.2458 + v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); 1.2459 + v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); 1.2460 + v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); 1.2461 + v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); 1.2462 + v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); 1.2463 + v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); 1.2464 + v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); 1.2465 + v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); 1.2466 + v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); 1.2467 + v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); 1.2468 + v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); 1.2469 + v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); 1.2470 + v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); 1.2471 + 1.2472 + u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); 1.2473 + u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); 1.2474 + u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); 1.2475 + u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); 1.2476 + u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); 1.2477 + u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); 1.2478 + u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); 1.2479 + u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); 1.2480 + u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); 1.2481 + u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); 1.2482 + u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); 1.2483 + u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); 1.2484 + u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); 1.2485 + u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); 1.2486 + u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); 1.2487 + u[15] = 
_mm_srai_epi32(v[15], DCT_CONST_BITS); 1.2488 + 1.2489 + v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); 1.2490 + v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); 1.2491 + v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); 1.2492 + v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); 1.2493 + v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); 1.2494 + v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); 1.2495 + v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); 1.2496 + v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); 1.2497 + v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); 1.2498 + v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); 1.2499 + v[10] = _mm_cmplt_epi32(u[10], kZero); 1.2500 + v[11] = _mm_cmplt_epi32(u[11], kZero); 1.2501 + v[12] = _mm_cmplt_epi32(u[12], kZero); 1.2502 + v[13] = _mm_cmplt_epi32(u[13], kZero); 1.2503 + v[14] = _mm_cmplt_epi32(u[14], kZero); 1.2504 + v[15] = _mm_cmplt_epi32(u[15], kZero); 1.2505 + 1.2506 + u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); 1.2507 + u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); 1.2508 + u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); 1.2509 + u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); 1.2510 + u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); 1.2511 + u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); 1.2512 + u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); 1.2513 + u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); 1.2514 + u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); 1.2515 + u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); 1.2516 + u[10] = _mm_sub_epi32(u[10], v[10]); 1.2517 + u[11] = _mm_sub_epi32(u[11], v[11]); 1.2518 + u[12] = _mm_sub_epi32(u[12], v[12]); 1.2519 + u[13] = _mm_sub_epi32(u[13], v[13]); 1.2520 + u[14] = _mm_sub_epi32(u[14], v[14]); 1.2521 + u[15] = _mm_sub_epi32(u[15], v[15]); 1.2522 + 1.2523 + v[0] = _mm_add_epi32(u[0], K32One); 1.2524 + v[1] = _mm_add_epi32(u[1], K32One); 1.2525 + v[2] = _mm_add_epi32(u[2], K32One); 1.2526 + v[3] = _mm_add_epi32(u[3], K32One); 1.2527 + v[4] = _mm_add_epi32(u[4], K32One); 1.2528 + v[5] = _mm_add_epi32(u[5], K32One); 1.2529 + v[6] = _mm_add_epi32(u[6], K32One); 1.2530 + v[7] = _mm_add_epi32(u[7], K32One); 1.2531 + v[8] = _mm_add_epi32(u[8], K32One); 1.2532 + v[9] = 
_mm_add_epi32(u[9], K32One); 1.2533 + v[10] = _mm_add_epi32(u[10], K32One); 1.2534 + v[11] = _mm_add_epi32(u[11], K32One); 1.2535 + v[12] = _mm_add_epi32(u[12], K32One); 1.2536 + v[13] = _mm_add_epi32(u[13], K32One); 1.2537 + v[14] = _mm_add_epi32(u[14], K32One); 1.2538 + v[15] = _mm_add_epi32(u[15], K32One); 1.2539 + 1.2540 + u[0] = _mm_srai_epi32(v[0], 2); 1.2541 + u[1] = _mm_srai_epi32(v[1], 2); 1.2542 + u[2] = _mm_srai_epi32(v[2], 2); 1.2543 + u[3] = _mm_srai_epi32(v[3], 2); 1.2544 + u[4] = _mm_srai_epi32(v[4], 2); 1.2545 + u[5] = _mm_srai_epi32(v[5], 2); 1.2546 + u[6] = _mm_srai_epi32(v[6], 2); 1.2547 + u[7] = _mm_srai_epi32(v[7], 2); 1.2548 + u[8] = _mm_srai_epi32(v[8], 2); 1.2549 + u[9] = _mm_srai_epi32(v[9], 2); 1.2550 + u[10] = _mm_srai_epi32(v[10], 2); 1.2551 + u[11] = _mm_srai_epi32(v[11], 2); 1.2552 + u[12] = _mm_srai_epi32(v[12], 2); 1.2553 + u[13] = _mm_srai_epi32(v[13], 2); 1.2554 + u[14] = _mm_srai_epi32(v[14], 2); 1.2555 + u[15] = _mm_srai_epi32(v[15], 2); 1.2556 + 1.2557 + out[ 5] = _mm_packs_epi32(u[0], u[1]); 1.2558 + out[21] = _mm_packs_epi32(u[2], u[3]); 1.2559 + out[13] = _mm_packs_epi32(u[4], u[5]); 1.2560 + out[29] = _mm_packs_epi32(u[6], u[7]); 1.2561 + out[ 3] = _mm_packs_epi32(u[8], u[9]); 1.2562 + out[19] = _mm_packs_epi32(u[10], u[11]); 1.2563 + out[11] = _mm_packs_epi32(u[12], u[13]); 1.2564 + out[27] = _mm_packs_epi32(u[14], u[15]); 1.2565 + } 1.2566 + } 1.2567 +#endif 1.2568 + // Transpose the results, do it as four 8x8 transposes. 
1.2569 + { 1.2570 + int transpose_block; 1.2571 + int16_t *output; 1.2572 + if (0 == pass) { 1.2573 + output = &intermediate[column_start * 32]; 1.2574 + } else { 1.2575 + output = &output_org[column_start * 32]; 1.2576 + } 1.2577 + for (transpose_block = 0; transpose_block < 4; ++transpose_block) { 1.2578 + __m128i *this_out = &out[8 * transpose_block]; 1.2579 + // 00 01 02 03 04 05 06 07 1.2580 + // 10 11 12 13 14 15 16 17 1.2581 + // 20 21 22 23 24 25 26 27 1.2582 + // 30 31 32 33 34 35 36 37 1.2583 + // 40 41 42 43 44 45 46 47 1.2584 + // 50 51 52 53 54 55 56 57 1.2585 + // 60 61 62 63 64 65 66 67 1.2586 + // 70 71 72 73 74 75 76 77 1.2587 + const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); 1.2588 + const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); 1.2589 + const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); 1.2590 + const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); 1.2591 + const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); 1.2592 + const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); 1.2593 + const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); 1.2594 + const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); 1.2595 + // 00 10 01 11 02 12 03 13 1.2596 + // 20 30 21 31 22 32 23 33 1.2597 + // 04 14 05 15 06 16 07 17 1.2598 + // 24 34 25 35 26 36 27 37 1.2599 + // 40 50 41 51 42 52 43 53 1.2600 + // 60 70 61 71 62 72 63 73 1.2601 + // 44 54 45 55 46 56 47 57 1.2602 + // 64 74 65 75 66 76 67 77 1.2603 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); 1.2604 + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); 1.2605 + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); 1.2606 + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); 1.2607 + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); 1.2608 + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); 1.2609 + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, 
tr0_5); 1.2610 + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); 1.2611 + // 00 10 20 30 01 11 21 31 1.2612 + // 40 50 60 70 41 51 61 71 1.2613 + // 02 12 22 32 03 13 23 33 1.2614 + // 42 52 62 72 43 53 63 73 1.2615 + // 04 14 24 34 05 15 25 35 1.2616 + // 44 54 64 74 45 55 65 75 1.2617 + // 06 16 26 36 07 17 27 37 1.2618 + // 46 56 66 76 47 57 67 77 1.2619 + __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); 1.2620 + __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); 1.2621 + __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); 1.2622 + __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); 1.2623 + __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); 1.2624 + __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); 1.2625 + __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); 1.2626 + __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); 1.2627 + // 00 10 20 30 40 50 60 70 1.2628 + // 01 11 21 31 41 51 61 71 1.2629 + // 02 12 22 32 42 52 62 72 1.2630 + // 03 13 23 33 43 53 63 73 1.2631 + // 04 14 24 34 44 54 64 74 1.2632 + // 05 15 25 35 45 55 65 75 1.2633 + // 06 16 26 36 46 56 66 76 1.2634 + // 07 17 27 37 47 57 67 77 1.2635 + if (0 == pass) { 1.2636 + // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; 1.2637 + // TODO(cd): see quality impact of only doing 1.2638 + // output[j] = (output[j] + 1) >> 2; 1.2639 + // which would remove the code between here ... 
1.2640 + __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); 1.2641 + __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); 1.2642 + __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); 1.2643 + __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); 1.2644 + __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); 1.2645 + __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); 1.2646 + __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); 1.2647 + __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); 1.2648 + tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); 1.2649 + tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); 1.2650 + tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); 1.2651 + tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); 1.2652 + tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); 1.2653 + tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); 1.2654 + tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); 1.2655 + tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); 1.2656 + // ... and here. 1.2657 + // PS: also change code in vp9/encoder/vp9_dct.c 1.2658 + tr2_0 = _mm_add_epi16(tr2_0, kOne); 1.2659 + tr2_1 = _mm_add_epi16(tr2_1, kOne); 1.2660 + tr2_2 = _mm_add_epi16(tr2_2, kOne); 1.2661 + tr2_3 = _mm_add_epi16(tr2_3, kOne); 1.2662 + tr2_4 = _mm_add_epi16(tr2_4, kOne); 1.2663 + tr2_5 = _mm_add_epi16(tr2_5, kOne); 1.2664 + tr2_6 = _mm_add_epi16(tr2_6, kOne); 1.2665 + tr2_7 = _mm_add_epi16(tr2_7, kOne); 1.2666 + tr2_0 = _mm_srai_epi16(tr2_0, 2); 1.2667 + tr2_1 = _mm_srai_epi16(tr2_1, 2); 1.2668 + tr2_2 = _mm_srai_epi16(tr2_2, 2); 1.2669 + tr2_3 = _mm_srai_epi16(tr2_3, 2); 1.2670 + tr2_4 = _mm_srai_epi16(tr2_4, 2); 1.2671 + tr2_5 = _mm_srai_epi16(tr2_5, 2); 1.2672 + tr2_6 = _mm_srai_epi16(tr2_6, 2); 1.2673 + tr2_7 = _mm_srai_epi16(tr2_7, 2); 1.2674 + } 1.2675 + // Note: even though all these stores are aligned, using the aligned 1.2676 + // intrinsic makes the code slightly slower. 
1.2677 + _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); 1.2678 + _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); 1.2679 + _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); 1.2680 + _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); 1.2681 + _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); 1.2682 + _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); 1.2683 + _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); 1.2684 + _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); 1.2685 + // Process next 8x8 1.2686 + output += 8; 1.2687 + } 1.2688 + } 1.2689 + } 1.2690 + } 1.2691 +} // NOLINT