media/libvpx/vp9/encoder/x86/vp9_dct32x32_sse2.c

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

#if FDCT32x32_HIGH_PRECISION
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  __m128i buf0, buf1;
  buf0 = _mm_mul_epu32(a, b);
  a = _mm_srli_epi64(a, 32);
  b = _mm_srli_epi64(b, 32);
  buf1 = _mm_mul_epu32(a, b);
  return _mm_add_epi64(buf0, buf1);
}

static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(buf0, buf1);
}
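
// Note on the two helpers above: SSE2 has no 32-bit multiply-accumulate, so
// k_madd_epi32 builds one from _mm_mul_epu32. The first multiply covers the
// even 32-bit lanes, the 64-bit shifts move the odd lanes into even position
// for the second multiply, and the final add leaves each 64-bit lane holding
//   a[2i] * b[2i] + a[2i + 1] * b[2i + 1]  (products taken as unsigned).
// k_packs_epi64 narrows those results: the shuffles gather the low 32 bits
// of each 64-bit lane, and the unpack packs a's two values into lanes 0-1
// and b's into lanes 2-3 of a single vector of four 32-bit integers.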
#endif

void FDCT32x32_2D(const int16_t *input,
                  int16_t *output_org, int stride) {
  // Calculate pre-multiplied strides
  const int str1 = stride;
  const int str2 = 2 * stride;
  const int str3 = 2 * stride + str1;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
  // Constants
  //    In one case all the lanes hold the same value; in all other cases it
  //    is a pair of values that needs to be repeated four times, which is
  //    done by constructing the 32-bit constant corresponding to that pair.
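  //    Each pair_set_epi16(a, b) constant holds a in the even 16-bit lanes
  //    and b in the odd lanes, so _mm_madd_epi16 against a register built by
  //    _mm_unpacklo/hi_epi16(x, y) produces a * x + b * y in every 32-bit
  //    lane, which is the raw butterfly rotation that each
  //    dct_const_round_shift stage below starts from.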
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  int pass;
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 32; column_start += 8) {
      __m128i step1[32];
      __m128i step2[32];
      __m128i step3[32];
      __m128i out[32];
      // Stage 1
      // Note: even though all the loads below are aligned, using the aligned
      //       intrinsic makes the code slightly slower.
      if (0 == pass) {
        const int16_t *in = &input[column_start];
        // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
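        // In scalar terms, each of the four blocks below computes, for the
        // rows i handled by that block (together covering i = 0..15):
        //   step1[i]      = (in[i * stride] + in[(31 - i) * stride]) << 2;
        //   step1[31 - i] = (in[i * stride] - in[(31 - i) * stride]) << 2;
        // i.e. the first add/sub butterfly of the 32-point transform plus
        // the << 2 input scaling applied in the first pass.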
        {
          const int16_t *ina = in + 0 * str1;
          const int16_t *inb = in + 31 * str1;
          __m128i *step1a = &step1[ 0];
          __m128i *step1b = &step1[31];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 4 * str1;
          const int16_t *inb = in + 27 * str1;
          __m128i *step1a = &step1[ 4];
          __m128i *step1b = &step1[27];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 8 * str1;
          const int16_t *inb = in + 23 * str1;
          __m128i *step1a = &step1[ 8];
          __m128i *step1b = &step1[23];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 12 * str1;
          const int16_t *inb = in + 19 * str1;
          __m128i *step1a = &step1[12];
          __m128i *step1b = &step1[19];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
      } else {
        int16_t *in = &intermediate[column_start];
        // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
        // Note: using the same approach as above to have a common offset is
        //       counter-productive here, as all the offsets can be calculated
        //       at compile time.
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
        {
          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
          step1[ 0] = _mm_add_epi16(in00, in31);
          step1[ 1] = _mm_add_epi16(in01, in30);
          step1[ 2] = _mm_add_epi16(in02, in29);
          step1[ 3] = _mm_add_epi16(in03, in28);
          step1[28] = _mm_sub_epi16(in03, in28);
          step1[29] = _mm_sub_epi16(in02, in29);
          step1[30] = _mm_sub_epi16(in01, in30);
          step1[31] = _mm_sub_epi16(in00, in31);
        }
        {
          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
          step1[ 4] = _mm_add_epi16(in04, in27);
          step1[ 5] = _mm_add_epi16(in05, in26);
          step1[ 6] = _mm_add_epi16(in06, in25);
          step1[ 7] = _mm_add_epi16(in07, in24);
          step1[24] = _mm_sub_epi16(in07, in24);
          step1[25] = _mm_sub_epi16(in06, in25);
          step1[26] = _mm_sub_epi16(in05, in26);
          step1[27] = _mm_sub_epi16(in04, in27);
        }
        {
          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
          step1[ 8] = _mm_add_epi16(in08, in23);
          step1[ 9] = _mm_add_epi16(in09, in22);
          step1[10] = _mm_add_epi16(in10, in21);
          step1[11] = _mm_add_epi16(in11, in20);
          step1[20] = _mm_sub_epi16(in11, in20);
          step1[21] = _mm_sub_epi16(in10, in21);
          step1[22] = _mm_sub_epi16(in09, in22);
          step1[23] = _mm_sub_epi16(in08, in23);
        }
        {
          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
          step1[12] = _mm_add_epi16(in12, in19);
          step1[13] = _mm_add_epi16(in13, in18);
          step1[14] = _mm_add_epi16(in14, in17);
          step1[15] = _mm_add_epi16(in15, in16);
          step1[16] = _mm_sub_epi16(in15, in16);
          step1[17] = _mm_sub_epi16(in14, in17);
          step1[18] = _mm_sub_epi16(in13, in18);
          step1[19] = _mm_sub_epi16(in12, in19);
        }
      }
      // Stage 2
      {
        step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
        step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
        step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
        step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
        step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
        step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
        step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
        step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
        step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
        step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
        step2[10] = _mm_sub_epi16(step1[5], step1[10]);
        step2[11] = _mm_sub_epi16(step1[4], step1[11]);
        step2[12] = _mm_sub_epi16(step1[3], step1[12]);
        step2[13] = _mm_sub_epi16(step1[2], step1[13]);
        step2[14] = _mm_sub_epi16(step1[1], step1[14]);
        step2[15] = _mm_sub_epi16(step1[0], step1[15]);
      }
      {
        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
        // dct_const_round_shift
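        // The add/shift pairs that follow are the vector form of
        //   dct_const_round_shift(x) = (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS
        // where DCT_CONST_ROUNDING is 1 << (DCT_CONST_BITS - 1), both taken
        // from vp9_idct.h. The same rounding pattern repeats after every
        // _mm_madd_epi16 rotation in the stages below.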
        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
        // Combine
        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
      }

#if !FDCT32x32_HIGH_PRECISION
      // Scale the intermediate values down here so that they stay within the
      // range of 16 bits.
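      // Per 16-bit lane the block below computes x = (x + 1 + (x < 0)) >> 2:
      // _mm_cmplt_epi16 yields an all-ones mask (-1) for negative lanes, so
      // subtracting the mask adds 1 only to negative values, after which the
      // +kOne and the arithmetic shift by 2 complete the rounded divide by 4
      // applied before the second pass continues with stage 3.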
      if (1 == pass) {
        __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
        __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
        __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
        __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
        __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
        __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
        __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
        __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);

        step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
        step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
        step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
        step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
        step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
        step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
        step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
        step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
        step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
        step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
        step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
        step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
        step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
        step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
        step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
        step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
        step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
        step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
        step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
        step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
        step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
        step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
        step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
        step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
        step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
        step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
        step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
        step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
        step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
        step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
        step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
        step1[31] = _mm_sub_epi16(step1[31], s3_31_0);

        step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
        step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
        step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
        step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
        step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
        step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
        step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
        step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
        step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
        step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
        step2[10] = _mm_add_epi16(step2[10], kOne);
        step2[11] = _mm_add_epi16(step2[11], kOne);
        step2[12] = _mm_add_epi16(step2[12], kOne);
        step2[13] = _mm_add_epi16(step2[13], kOne);
        step2[14] = _mm_add_epi16(step2[14], kOne);
        step2[15] = _mm_add_epi16(step2[15], kOne);
        step1[16] = _mm_add_epi16(step1[16], kOne);
        step1[17] = _mm_add_epi16(step1[17], kOne);
        step1[18] = _mm_add_epi16(step1[18], kOne);
        step1[19] = _mm_add_epi16(step1[19], kOne);
        step2[20] = _mm_add_epi16(step2[20], kOne);
        step2[21] = _mm_add_epi16(step2[21], kOne);
        step2[22] = _mm_add_epi16(step2[22], kOne);
        step2[23] = _mm_add_epi16(step2[23], kOne);
        step2[24] = _mm_add_epi16(step2[24], kOne);
        step2[25] = _mm_add_epi16(step2[25], kOne);
        step2[26] = _mm_add_epi16(step2[26], kOne);
        step2[27] = _mm_add_epi16(step2[27], kOne);
        step1[28] = _mm_add_epi16(step1[28], kOne);
        step1[29] = _mm_add_epi16(step1[29], kOne);
        step1[30] = _mm_add_epi16(step1[30], kOne);
        step1[31] = _mm_add_epi16(step1[31], kOne);

        step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
        step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
        step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
        step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
        step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
        step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
        step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
        step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
        step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
        step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
        step2[10] = _mm_srai_epi16(step2[10], 2);
        step2[11] = _mm_srai_epi16(step2[11], 2);
        step2[12] = _mm_srai_epi16(step2[12], 2);
        step2[13] = _mm_srai_epi16(step2[13], 2);
        step2[14] = _mm_srai_epi16(step2[14], 2);
        step2[15] = _mm_srai_epi16(step2[15], 2);
        step1[16] = _mm_srai_epi16(step1[16], 2);
        step1[17] = _mm_srai_epi16(step1[17], 2);
        step1[18] = _mm_srai_epi16(step1[18], 2);
        step1[19] = _mm_srai_epi16(step1[19], 2);
        step2[20] = _mm_srai_epi16(step2[20], 2);
        step2[21] = _mm_srai_epi16(step2[21], 2);
        step2[22] = _mm_srai_epi16(step2[22], 2);
        step2[23] = _mm_srai_epi16(step2[23], 2);
        step2[24] = _mm_srai_epi16(step2[24], 2);
        step2[25] = _mm_srai_epi16(step2[25], 2);
        step2[26] = _mm_srai_epi16(step2[26], 2);
        step2[27] = _mm_srai_epi16(step2[27], 2);
        step1[28] = _mm_srai_epi16(step1[28], 2);
        step1[29] = _mm_srai_epi16(step1[29], 2);
        step1[30] = _mm_srai_epi16(step1[30], 2);
        step1[31] = _mm_srai_epi16(step1[31], 2);
      }
#endif

#if FDCT32x32_HIGH_PRECISION
      if (pass == 0) {
#endif
        // Stage 3
        {
          step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]);
          step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]);
          step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]);
          step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]);
          step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]);
          step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]);
          step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]);
          step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]);
        }
        {
          const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
          const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
          const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
          const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
          const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
          const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
          const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
          const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
          const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
          const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
          const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
          const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
          const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
          const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
          const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
          const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
          const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
          const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
          const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
          const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
          const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
          // Combine
          step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7);
          step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7);
          step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7);
          step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7);
        }
        {
          step3[16] = _mm_add_epi16(step2[23], step1[16]);
          step3[17] = _mm_add_epi16(step2[22], step1[17]);
          step3[18] = _mm_add_epi16(step2[21], step1[18]);
          step3[19] = _mm_add_epi16(step2[20], step1[19]);
          step3[20] = _mm_sub_epi16(step1[19], step2[20]);
          step3[21] = _mm_sub_epi16(step1[18], step2[21]);
          step3[22] = _mm_sub_epi16(step1[17], step2[22]);
          step3[23] = _mm_sub_epi16(step1[16], step2[23]);
          step3[24] = _mm_sub_epi16(step1[31], step2[24]);
          step3[25] = _mm_sub_epi16(step1[30], step2[25]);
          step3[26] = _mm_sub_epi16(step1[29], step2[26]);
          step3[27] = _mm_sub_epi16(step1[28], step2[27]);
          step3[28] = _mm_add_epi16(step2[27], step1[28]);
          step3[29] = _mm_add_epi16(step2[26], step1[29]);
          step3[30] = _mm_add_epi16(step2[25], step1[30]);
          step3[31] = _mm_add_epi16(step2[24], step1[31]);
        }

        // Stage 4
        {
          step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]);
          step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]);
          step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]);
          step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]);
          step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]);
          step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]);
          step1[10] = _mm_sub_epi16(step2[ 9], step3[10]);
          step1[11] = _mm_sub_epi16(step2[ 8], step3[11]);
          step1[12] = _mm_sub_epi16(step2[15], step3[12]);
          step1[13] = _mm_sub_epi16(step2[14], step3[13]);
          step1[14] = _mm_add_epi16(step3[13], step2[14]);
          step1[15] = _mm_add_epi16(step3[12], step2[15]);
        }
        {
          const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]);
          const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]);
          const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16);
          const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16);
          const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16);
          const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS);
          const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS);
          const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS);
          const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS);
          // Combine
          step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7);
          step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7);
        }
        {
          const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]);
          const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]);
          const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]);
          const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]);
          const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]);
          const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]);
          const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]);
          const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]);
          const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24);
          const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24);
          const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24);
          const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24);
          const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08);
          const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08);
          const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08);
          const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08);
          const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24);
          const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24);
          const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24);
          const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24);
          const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08);
          const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08);
          const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08);
          const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08);
          // dct_const_round_shift
          const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING);
          const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING);
          const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS);
          const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS);
          const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS);
          const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS);
          const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS);
          const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS);
          const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS);
          const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS);
          const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS);
          const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS);
          const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS);
          const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS);
          const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS);
          const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS);
          const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS);
          const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS);
          // Combine
          step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7);
          step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7);
          step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7);
          step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7);
          step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7);
          step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7);
          step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7);
          step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7);
        }
        // Stage 5
        {
          step2[4] = _mm_add_epi16(step1[5], step3[4]);
          step2[5] = _mm_sub_epi16(step3[4], step1[5]);
          step2[6] = _mm_sub_epi16(step3[7], step1[6]);
          step2[7] = _mm_add_epi16(step1[6], step3[7]);
        }
        {
          const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]);
          const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]);
          const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]);
          const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]);
          const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16);
          const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16);
          const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16);
          const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16);
          const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08);
          const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08);
          const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24);
          const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING);
          const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING);
          const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING);
          const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING);
          const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING);
          const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING);
          const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING);
          const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING);
          const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS);
          const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS);
          const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS);
          const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS);
          const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS);
          const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS);
          const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS);
          const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS);
          // Combine
          out[ 0] = _mm_packs_epi32(out_00_6, out_00_7);
          out[16] = _mm_packs_epi32(out_16_6, out_16_7);
          out[ 8] = _mm_packs_epi32(out_08_6, out_08_7);
          out[24] = _mm_packs_epi32(out_24_6, out_24_7);
        }
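        // The block above already produced the outputs whose indices are
        // multiples of eight for this group of columns: out[0], out[8],
        // out[16] and out[24]. The remaining stages fill in the other 28
        // entries of out[].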
michael@0 742 {
michael@0 743 const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]);
michael@0 744 const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]);
michael@0 745 const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]);
michael@0 746 const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]);
michael@0 747 const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24);
michael@0 748 const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24);
michael@0 749 const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08);
michael@0 750 const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08);
michael@0 751 const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24);
michael@0 752 const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24);
michael@0 753 const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08);
michael@0 754 const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08);
michael@0 755 // dct_const_round_shift
michael@0 756 const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING);
michael@0 757 const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING);
michael@0 758 const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING);
michael@0 759 const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING);
michael@0 760 const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING);
michael@0 761 const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING);
michael@0 762 const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING);
michael@0 763 const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING);
michael@0 764 const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS);
michael@0 765 const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS);
michael@0 766 const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS);
michael@0 767 const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS);
michael@0 768 const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS);
michael@0 769 const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS);
michael@0 770 const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS);
michael@0 771 const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS);
michael@0 772 // Combine
michael@0 773 step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7);
michael@0 774 step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7);
michael@0 775 step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7);
michael@0 776 step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7);
michael@0 777 }
michael@0 778 {
michael@0 779 step2[16] = _mm_add_epi16(step1[19], step3[16]);
michael@0 780 step2[17] = _mm_add_epi16(step1[18], step3[17]);
michael@0 781 step2[18] = _mm_sub_epi16(step3[17], step1[18]);
michael@0 782 step2[19] = _mm_sub_epi16(step3[16], step1[19]);
michael@0 783 step2[20] = _mm_sub_epi16(step3[23], step1[20]);
michael@0 784 step2[21] = _mm_sub_epi16(step3[22], step1[21]);
michael@0 785 step2[22] = _mm_add_epi16(step1[21], step3[22]);
michael@0 786 step2[23] = _mm_add_epi16(step1[20], step3[23]);
michael@0 787 step2[24] = _mm_add_epi16(step1[27], step3[24]);
michael@0 788 step2[25] = _mm_add_epi16(step1[26], step3[25]);
michael@0 789 step2[26] = _mm_sub_epi16(step3[25], step1[26]);
michael@0 790 step2[27] = _mm_sub_epi16(step3[24], step1[27]);
michael@0 791 step2[28] = _mm_sub_epi16(step3[31], step1[28]);
michael@0 792 step2[29] = _mm_sub_epi16(step3[30], step1[29]);
michael@0 793 step2[30] = _mm_add_epi16(step1[29], step3[30]);
michael@0 794 step2[31] = _mm_add_epi16(step1[28], step3[31]);
michael@0 795 }
michael@0 796 // Stage 6
michael@0 797 {
michael@0 798 const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
michael@0 799 const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
michael@0 800 const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
michael@0 801 const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
michael@0 802 const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]);
michael@0 803 const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]);
michael@0 804 const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]);
michael@0 805 const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]);
michael@0 806 const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04);
michael@0 807 const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04);
michael@0 808 const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20);
michael@0 809 const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20);
michael@0 810 const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12);
michael@0 811 const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12);
michael@0 812 const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28);
michael@0 813 const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28);
michael@0 814 // dct_const_round_shift
michael@0 815 const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING);
michael@0 816 const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING);
michael@0 817 const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING);
michael@0 818 const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING);
michael@0 819 const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING);
michael@0 820 const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING);
michael@0 821 const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING);
michael@0 822 const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING);
michael@0 823 const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS);
michael@0 824 const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS);
michael@0 825 const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS);
michael@0 826 const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS);
michael@0 827 const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS);
michael@0 828 const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS);
michael@0 829 const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS);
michael@0 830 const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS);
michael@0 831 // Combine
michael@0 832 out[ 4] = _mm_packs_epi32(out_04_6, out_04_7);
michael@0 833 out[20] = _mm_packs_epi32(out_20_6, out_20_7);
michael@0 834 out[12] = _mm_packs_epi32(out_12_6, out_12_7);
michael@0 835 out[28] = _mm_packs_epi32(out_28_6, out_28_7);
michael@0 836 }
michael@0 837 {
michael@0 838 step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]);
michael@0 839 step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]);
michael@0 840 step3[10] = _mm_sub_epi16(step1[11], step2[10]);
michael@0 841 step3[11] = _mm_add_epi16(step2[10], step1[11]);
michael@0 842 step3[12] = _mm_add_epi16(step2[13], step1[12]);
michael@0 843 step3[13] = _mm_sub_epi16(step1[12], step2[13]);
michael@0 844 step3[14] = _mm_sub_epi16(step1[15], step2[14]);
michael@0 845 step3[15] = _mm_add_epi16(step2[14], step1[15]);
michael@0 846 }
michael@0 847 {
michael@0 848 const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]);
michael@0 849 const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]);
michael@0 850 const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]);
michael@0 851 const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]);
michael@0 852 const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]);
michael@0 853 const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]);
michael@0 854 const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]);
michael@0 855 const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]);
michael@0 856 const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28);
michael@0 857 const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28);
michael@0 858 const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04);
michael@0 859 const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04);
michael@0 860 const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12);
michael@0 861 const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12);
michael@0 862 const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20);
michael@0 863 const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20);
michael@0 864 const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12);
michael@0 865 const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12);
michael@0 866 const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20);
michael@0 867 const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20);
michael@0 868 const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28);
michael@0 869 const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28);
michael@0 870 const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04);
michael@0 871 const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04);
michael@0 872 // dct_const_round_shift
michael@0 873 const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING);
michael@0 874 const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING);
michael@0 875 const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING);
michael@0 876 const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING);
michael@0 877 const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING);
michael@0 878 const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING);
michael@0 879 const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING);
michael@0 880 const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING);
michael@0 881 const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS);
michael@0 882 const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS);
michael@0 883 const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS);
michael@0 884 const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS);
michael@0 885 const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS);
michael@0 886 const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS);
michael@0 887 const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS);
michael@0 888 const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS);
michael@0 889 const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING);
michael@0 890 const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING);
michael@0 891 const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING);
michael@0 892 const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING);
michael@0 893 const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING);
michael@0 894 const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING);
michael@0 895 const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING);
michael@0 896 const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING);
michael@0 897 const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS);
michael@0 898 const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS);
michael@0 899 const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS);
michael@0 900 const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS);
michael@0 901 const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS);
michael@0 902 const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS);
michael@0 903 const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS);
michael@0 904 const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS);
michael@0 905 // Combine
michael@0 906 step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7);
michael@0 907 step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7);
michael@0 908 step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7);
michael@0 909 step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7);
michael@0 910 // Combine
michael@0 911 step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7);
michael@0 912 step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7);
michael@0 913 step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7);
michael@0 914 step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7);
michael@0 915 }
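// step3[25], [26], [29] and [30] reuse the interleaved inputs built for
// step3[22], [21], [18] and [17] respectively, only with the complementary
// cosine pairs, so the eight outputs above need just eight unpacks.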
michael@0 916 // Stage 7
michael@0 917 {
michael@0 918 const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]);
michael@0 919 const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]);
michael@0 920 const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]);
michael@0 921 const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]);
michael@0 922 const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]);
michael@0 923 const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]);
michael@0 924 const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]);
michael@0 925 const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]);
michael@0 926 const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02);
michael@0 927 const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02);
michael@0 928 const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18);
michael@0 929 const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18);
michael@0 930 const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10);
michael@0 931 const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10);
michael@0 932 const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26);
michael@0 933 const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26);
michael@0 934 const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06);
michael@0 935 const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06);
michael@0 936 const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22);
michael@0 937 const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22);
michael@0 938 const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14);
michael@0 939 const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14);
michael@0 940 const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30);
michael@0 941 const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30);
michael@0 942 // dct_const_round_shift
michael@0 943 const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING);
michael@0 944 const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING);
michael@0 945 const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING);
michael@0 946 const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING);
michael@0 947 const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING);
michael@0 948 const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING);
michael@0 949 const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING);
michael@0 950 const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING);
michael@0 951 const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING);
michael@0 952 const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING);
michael@0 953 const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING);
michael@0 954 const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING);
michael@0 955 const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING);
michael@0 956 const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING);
michael@0 957 const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING);
michael@0 958 const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING);
michael@0 959 const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS);
michael@0 960 const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS);
michael@0 961 const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS);
michael@0 962 const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS);
michael@0 963 const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS);
michael@0 964 const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS);
michael@0 965 const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS);
michael@0 966 const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS);
michael@0 967 const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS);
michael@0 968 const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS);
michael@0 969 const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS);
michael@0 970 const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS);
michael@0 971 const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS);
michael@0 972 const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS);
michael@0 973 const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS);
michael@0 974 const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS);
michael@0 975 // Combine
michael@0 976 out[ 2] = _mm_packs_epi32(out_02_6, out_02_7);
michael@0 977 out[18] = _mm_packs_epi32(out_18_6, out_18_7);
michael@0 978 out[10] = _mm_packs_epi32(out_10_6, out_10_7);
michael@0 979 out[26] = _mm_packs_epi32(out_26_6, out_26_7);
michael@0 980 out[ 6] = _mm_packs_epi32(out_06_6, out_06_7);
michael@0 981 out[22] = _mm_packs_epi32(out_22_6, out_22_7);
michael@0 982 out[14] = _mm_packs_epi32(out_14_6, out_14_7);
michael@0 983 out[30] = _mm_packs_epi32(out_30_6, out_30_7);
michael@0 984 }
michael@0 985 {
michael@0 986 step1[16] = _mm_add_epi16(step3[17], step2[16]);
michael@0 987 step1[17] = _mm_sub_epi16(step2[16], step3[17]);
michael@0 988 step1[18] = _mm_sub_epi16(step2[19], step3[18]);
michael@0 989 step1[19] = _mm_add_epi16(step3[18], step2[19]);
michael@0 990 step1[20] = _mm_add_epi16(step3[21], step2[20]);
michael@0 991 step1[21] = _mm_sub_epi16(step2[20], step3[21]);
michael@0 992 step1[22] = _mm_sub_epi16(step2[23], step3[22]);
michael@0 993 step1[23] = _mm_add_epi16(step3[22], step2[23]);
michael@0 994 step1[24] = _mm_add_epi16(step3[25], step2[24]);
michael@0 995 step1[25] = _mm_sub_epi16(step2[24], step3[25]);
michael@0 996 step1[26] = _mm_sub_epi16(step2[27], step3[26]);
michael@0 997 step1[27] = _mm_add_epi16(step3[26], step2[27]);
michael@0 998 step1[28] = _mm_add_epi16(step3[29], step2[28]);
michael@0 999 step1[29] = _mm_sub_epi16(step2[28], step3[29]);
michael@0 1000 step1[30] = _mm_sub_epi16(step2[31], step3[30]);
michael@0 1001 step1[31] = _mm_add_epi16(step3[30], step2[31]);
michael@0 1002 }
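// step1[16..31]: add/sub butterflies combining the rotated step3 terms with
// the pass-through step2 terms ahead of the final output stage.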
michael@0 1003 // Final stage --- output indices are bit-reversed.

michael@0 1004 {
michael@0 1005 const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]);
michael@0 1006 const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]);
michael@0 1007 const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]);
michael@0 1008 const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]);
michael@0 1009 const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]);
michael@0 1010 const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]);
michael@0 1011 const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]);
michael@0 1012 const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]);
michael@0 1013 const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01);
michael@0 1014 const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01);
michael@0 1015 const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17);
michael@0 1016 const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17);
michael@0 1017 const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09);
michael@0 1018 const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09);
michael@0 1019 const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25);
michael@0 1020 const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25);
michael@0 1021 const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07);
michael@0 1022 const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07);
michael@0 1023 const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23);
michael@0 1024 const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23);
michael@0 1025 const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15);
michael@0 1026 const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15);
michael@0 1027 const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31);
michael@0 1028 const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31);
michael@0 1029 // dct_const_round_shift
michael@0 1030 const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING);
michael@0 1031 const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING);
michael@0 1032 const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING);
michael@0 1033 const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING);
michael@0 1034 const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING);
michael@0 1035 const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING);
michael@0 1036 const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING);
michael@0 1037 const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING);
michael@0 1038 const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING);
michael@0 1039 const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING);
michael@0 1040 const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING);
michael@0 1041 const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING);
michael@0 1042 const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING);
michael@0 1043 const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING);
michael@0 1044 const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING);
michael@0 1045 const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING);
michael@0 1046 const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS);
michael@0 1047 const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS);
michael@0 1048 const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS);
michael@0 1049 const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS);
michael@0 1050 const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS);
michael@0 1051 const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS);
michael@0 1052 const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS);
michael@0 1053 const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS);
michael@0 1054 const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS);
michael@0 1055 const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS);
michael@0 1056 const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS);
michael@0 1057 const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS);
michael@0 1058 const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS);
michael@0 1059 const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS);
michael@0 1060 const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS);
michael@0 1061 const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS);
michael@0 1062 // Combine
michael@0 1063 out[ 1] = _mm_packs_epi32(out_01_6, out_01_7);
michael@0 1064 out[17] = _mm_packs_epi32(out_17_6, out_17_7);
michael@0 1065 out[ 9] = _mm_packs_epi32(out_09_6, out_09_7);
michael@0 1066 out[25] = _mm_packs_epi32(out_25_6, out_25_7);
michael@0 1067 out[ 7] = _mm_packs_epi32(out_07_6, out_07_7);
michael@0 1068 out[23] = _mm_packs_epi32(out_23_6, out_23_7);
michael@0 1069 out[15] = _mm_packs_epi32(out_15_6, out_15_7);
michael@0 1070 out[31] = _mm_packs_epi32(out_31_6, out_31_7);
michael@0 1071 }
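// The first half of the final stage produces the odd coefficients
// 1, 17, 9, 25, 7, 23, 15 and 31 from step1[16..31]; the block below
// produces the remaining odd coefficients from step1[20..27].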
michael@0 1072 {
michael@0 1073 const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]);
michael@0 1074 const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]);
michael@0 1075 const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]);
michael@0 1076 const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]);
michael@0 1077 const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]);
michael@0 1078 const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]);
michael@0 1079 const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]);
michael@0 1080 const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]);
michael@0 1081 const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05);
michael@0 1082 const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05);
michael@0 1083 const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21);
michael@0 1084 const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21);
michael@0 1085 const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13);
michael@0 1086 const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13);
michael@0 1087 const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29);
michael@0 1088 const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29);
michael@0 1089 const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03);
michael@0 1090 const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03);
michael@0 1091 const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19);
michael@0 1092 const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19);
michael@0 1093 const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11);
michael@0 1094 const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11);
michael@0 1095 const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27);
michael@0 1096 const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27);
michael@0 1097 // dct_const_round_shift
michael@0 1098 const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING);
michael@0 1099 const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING);
michael@0 1100 const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING);
michael@0 1101 const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING);
michael@0 1102 const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING);
michael@0 1103 const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING);
michael@0 1104 const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING);
michael@0 1105 const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING);
michael@0 1106 const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING);
michael@0 1107 const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING);
michael@0 1108 const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING);
michael@0 1109 const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING);
michael@0 1110 const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING);
michael@0 1111 const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING);
michael@0 1112 const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING);
michael@0 1113 const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING);
michael@0 1114 const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS);
michael@0 1115 const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS);
michael@0 1116 const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS);
michael@0 1117 const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS);
michael@0 1118 const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS);
michael@0 1119 const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS);
michael@0 1120 const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS);
michael@0 1121 const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS);
michael@0 1122 const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS);
michael@0 1123 const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS);
michael@0 1124 const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS);
michael@0 1125 const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS);
michael@0 1126 const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS);
michael@0 1127 const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS);
michael@0 1128 const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS);
michael@0 1129 const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS);
michael@0 1130 // Combine
michael@0 1131 out[ 5] = _mm_packs_epi32(out_05_6, out_05_7);
michael@0 1132 out[21] = _mm_packs_epi32(out_21_6, out_21_7);
michael@0 1133 out[13] = _mm_packs_epi32(out_13_6, out_13_7);
michael@0 1134 out[29] = _mm_packs_epi32(out_29_6, out_29_7);
michael@0 1135 out[ 3] = _mm_packs_epi32(out_03_6, out_03_7);
michael@0 1136 out[19] = _mm_packs_epi32(out_19_6, out_19_7);
michael@0 1137 out[11] = _mm_packs_epi32(out_11_6, out_11_7);
michael@0 1138 out[27] = _mm_packs_epi32(out_27_6, out_27_7);
michael@0 1139 }
michael@0 1140 #if FDCT32x32_HIGH_PRECISION
michael@0 1141 } else {
michael@0 1142 __m128i lstep1[64], lstep2[64], lstep3[64];
michael@0 1143 __m128i u[32], v[32], sign[16];
michael@0 1144 const __m128i K32One = _mm_set_epi32(1, 1, 1, 1);
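// From stage 3 onward this branch switches to 32-bit lanes (lstep1/lstep2/
// lstep3) so the intermediate butterflies keep their full precision.
// K32One is the +1 term of the round-then-shift-right-by-2 applied to each
// output coefficient in this branch.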
michael@0 1145 // start using 32-bit operations
michael@0 1146 // stage 3
michael@0 1147 {
michael@0 1148 // expanding to 32-bit length prior to addition operations
michael@0 1149 lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero);
michael@0 1150 lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero);
michael@0 1151 lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero);
michael@0 1152 lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero);
michael@0 1153 lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero);
michael@0 1154 lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero);
michael@0 1155 lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero);
michael@0 1156 lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero);
michael@0 1157 lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero);
michael@0 1158 lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero);
michael@0 1159 lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero);
michael@0 1160 lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero);
michael@0 1161 lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero);
michael@0 1162 lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero);
michael@0 1163 lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero);
michael@0 1164 lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero);
michael@0 1165 lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne);
michael@0 1166 lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne);
michael@0 1167 lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne);
michael@0 1168 lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne);
michael@0 1169 lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne);
michael@0 1170 lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne);
michael@0 1171 lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne);
michael@0 1172 lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne);
michael@0 1173 lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne);
michael@0 1174 lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne);
michael@0 1175 lstep2[10] = _mm_madd_epi16(lstep2[10], kOne);
michael@0 1176 lstep2[11] = _mm_madd_epi16(lstep2[11], kOne);
michael@0 1177 lstep2[12] = _mm_madd_epi16(lstep2[12], kOne);
michael@0 1178 lstep2[13] = _mm_madd_epi16(lstep2[13], kOne);
michael@0 1179 lstep2[14] = _mm_madd_epi16(lstep2[14], kOne);
michael@0 1180 lstep2[15] = _mm_madd_epi16(lstep2[15], kOne);
michael@0 1181
michael@0 1182 lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]);
michael@0 1183 lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]);
michael@0 1184 lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]);
michael@0 1185 lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]);
michael@0 1186 lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]);
michael@0 1187 lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]);
michael@0 1188 lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]);
michael@0 1189 lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]);
michael@0 1190 lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]);
michael@0 1191 lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]);
michael@0 1192 lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]);
michael@0 1193 lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]);
michael@0 1194 lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]);
michael@0 1195 lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]);
michael@0 1196 lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]);
michael@0 1197 lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]);
michael@0 1198 }
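// The unpack-with-kZero / _mm_madd_epi16-with-kOne pairs above are a
// sign-extension idiom: interleaving with zero widens each 16-bit value to
// a 32-bit lane, and the signed multiply-add by 1 (kOne holds 1 in every
// 16-bit lane) restores the sign that a plain zero-extension would lose.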
michael@0 1199 {
michael@0 1200 const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]);
michael@0 1201 const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]);
michael@0 1202 const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]);
michael@0 1203 const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]);
michael@0 1204 const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16);
michael@0 1205 const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16);
michael@0 1206 const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16);
michael@0 1207 const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16);
michael@0 1208 const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16);
michael@0 1209 const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16);
michael@0 1210 const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16);
michael@0 1211 const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16);
michael@0 1212 // dct_const_round_shift
michael@0 1213 const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING);
michael@0 1214 const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING);
michael@0 1215 const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING);
michael@0 1216 const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING);
michael@0 1217 const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING);
michael@0 1218 const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING);
michael@0 1219 const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING);
michael@0 1220 const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING);
michael@0 1221 lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS);
michael@0 1222 lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS);
michael@0 1223 lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS);
michael@0 1224 lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS);
michael@0 1225 lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS);
michael@0 1226 lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS);
michael@0 1227 lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS);
michael@0 1228 lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS);
michael@0 1229 }
michael@0 1230 {
michael@0 1231 lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero);
michael@0 1232 lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero);
michael@0 1233 lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero);
michael@0 1234 lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero);
michael@0 1235 lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero);
michael@0 1236 lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero);
michael@0 1237 lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero);
michael@0 1238 lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero);
michael@0 1239 lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero);
michael@0 1240 lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero);
michael@0 1241 lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero);
michael@0 1242 lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero);
michael@0 1243 lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero);
michael@0 1244 lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero);
michael@0 1245 lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero);
michael@0 1246 lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero);
michael@0 1247 lstep2[40] = _mm_madd_epi16(lstep2[40], kOne);
michael@0 1248 lstep2[41] = _mm_madd_epi16(lstep2[41], kOne);
michael@0 1249 lstep2[42] = _mm_madd_epi16(lstep2[42], kOne);
michael@0 1250 lstep2[43] = _mm_madd_epi16(lstep2[43], kOne);
michael@0 1251 lstep2[44] = _mm_madd_epi16(lstep2[44], kOne);
michael@0 1252 lstep2[45] = _mm_madd_epi16(lstep2[45], kOne);
michael@0 1253 lstep2[46] = _mm_madd_epi16(lstep2[46], kOne);
michael@0 1254 lstep2[47] = _mm_madd_epi16(lstep2[47], kOne);
michael@0 1255 lstep2[48] = _mm_madd_epi16(lstep2[48], kOne);
michael@0 1256 lstep2[49] = _mm_madd_epi16(lstep2[49], kOne);
michael@0 1257 lstep2[50] = _mm_madd_epi16(lstep2[50], kOne);
michael@0 1258 lstep2[51] = _mm_madd_epi16(lstep2[51], kOne);
michael@0 1259 lstep2[52] = _mm_madd_epi16(lstep2[52], kOne);
michael@0 1260 lstep2[53] = _mm_madd_epi16(lstep2[53], kOne);
michael@0 1261 lstep2[54] = _mm_madd_epi16(lstep2[54], kOne);
michael@0 1262 lstep2[55] = _mm_madd_epi16(lstep2[55], kOne);
michael@0 1263
michael@0 1264 lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero);
michael@0 1265 lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero);
michael@0 1266 lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero);
michael@0 1267 lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero);
michael@0 1268 lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero);
michael@0 1269 lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero);
michael@0 1270 lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero);
michael@0 1271 lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero);
michael@0 1272 lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero);
michael@0 1273 lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero);
michael@0 1274 lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero);
michael@0 1275 lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero);
michael@0 1276 lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero);
michael@0 1277 lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero);
michael@0 1278 lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero);
michael@0 1279 lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero);
michael@0 1280 lstep1[32] = _mm_madd_epi16(lstep1[32], kOne);
michael@0 1281 lstep1[33] = _mm_madd_epi16(lstep1[33], kOne);
michael@0 1282 lstep1[34] = _mm_madd_epi16(lstep1[34], kOne);
michael@0 1283 lstep1[35] = _mm_madd_epi16(lstep1[35], kOne);
michael@0 1284 lstep1[36] = _mm_madd_epi16(lstep1[36], kOne);
michael@0 1285 lstep1[37] = _mm_madd_epi16(lstep1[37], kOne);
michael@0 1286 lstep1[38] = _mm_madd_epi16(lstep1[38], kOne);
michael@0 1287 lstep1[39] = _mm_madd_epi16(lstep1[39], kOne);
michael@0 1288 lstep1[56] = _mm_madd_epi16(lstep1[56], kOne);
michael@0 1289 lstep1[57] = _mm_madd_epi16(lstep1[57], kOne);
michael@0 1290 lstep1[58] = _mm_madd_epi16(lstep1[58], kOne);
michael@0 1291 lstep1[59] = _mm_madd_epi16(lstep1[59], kOne);
michael@0 1292 lstep1[60] = _mm_madd_epi16(lstep1[60], kOne);
michael@0 1293 lstep1[61] = _mm_madd_epi16(lstep1[61], kOne);
michael@0 1294 lstep1[62] = _mm_madd_epi16(lstep1[62], kOne);
michael@0 1295 lstep1[63] = _mm_madd_epi16(lstep1[63], kOne);
michael@0 1296
michael@0 1297 lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]);
michael@0 1298 lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]);
michael@0 1299
michael@0 1300 lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]);
michael@0 1301 lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]);
michael@0 1302 lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]);
michael@0 1303 lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]);
michael@0 1304 lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]);
michael@0 1305 lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]);
michael@0 1306 lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]);
michael@0 1307 lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]);
michael@0 1308 lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]);
michael@0 1309 lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]);
michael@0 1310 lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]);
michael@0 1311 lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]);
michael@0 1312 lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]);
michael@0 1313 lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]);
michael@0 1314 lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]);
michael@0 1315 lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]);
michael@0 1316 lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]);
michael@0 1317 lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]);
michael@0 1318 lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]);
michael@0 1319 lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]);
michael@0 1320 lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]);
michael@0 1321 lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]);
michael@0 1322 lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]);
michael@0 1323 lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]);
michael@0 1324 lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]);
michael@0 1325 lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]);
michael@0 1326 lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]);
michael@0 1327 lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]);
michael@0 1328 lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]);
michael@0 1329 lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]);
michael@0 1330 }
michael@0 1331
michael@0 1332 // stage 4
michael@0 1333 {
michael@0 1334 // expanding to 32-bit length prior to addition operations
michael@0 1335 lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero);
michael@0 1336 lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero);
michael@0 1337 lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero);
michael@0 1338 lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero);
michael@0 1339 lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero);
michael@0 1340 lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero);
michael@0 1341 lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero);
michael@0 1342 lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero);
michael@0 1343 lstep2[16] = _mm_madd_epi16(lstep2[16], kOne);
michael@0 1344 lstep2[17] = _mm_madd_epi16(lstep2[17], kOne);
michael@0 1345 lstep2[18] = _mm_madd_epi16(lstep2[18], kOne);
michael@0 1346 lstep2[19] = _mm_madd_epi16(lstep2[19], kOne);
michael@0 1347 lstep2[28] = _mm_madd_epi16(lstep2[28], kOne);
michael@0 1348 lstep2[29] = _mm_madd_epi16(lstep2[29], kOne);
michael@0 1349 lstep2[30] = _mm_madd_epi16(lstep2[30], kOne);
michael@0 1350 lstep2[31] = _mm_madd_epi16(lstep2[31], kOne);
michael@0 1351
michael@0 1352 lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]);
michael@0 1353 lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]);
michael@0 1354 lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]);
michael@0 1355 lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]);
michael@0 1356 lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]);
michael@0 1357 lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]);
michael@0 1358 lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]);
michael@0 1359 lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]);
michael@0 1360 lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]);
michael@0 1361 lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]);
michael@0 1362 lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]);
michael@0 1363 lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]);
michael@0 1364 lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]);
michael@0 1365 lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]);
michael@0 1366 lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]);
michael@0 1367 lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]);
michael@0 1368 lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]);
michael@0 1369 lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]);
michael@0 1370 lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]);
michael@0 1371 lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]);
michael@0 1372 lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]);
michael@0 1373 lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]);
michael@0 1374 lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]);
michael@0 1375 lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]);
michael@0 1376 }
michael@0 1377 {
michael@0 1378 // Stage 4 rotation of lstep3[10..13] by cospi_16_64, the 32-bit
michael@0 1379 // counterpart of the k__cospi_p16_m16 / k__cospi_p16_p16 butterfly.
michael@0 1380 const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
michael@0 1381 const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
michael@0 1382
michael@0 1383 u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]);
michael@0 1384 u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]);
michael@0 1385 u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]);
michael@0 1386 u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]);
michael@0 1387
michael@0 1388 // TODO(jingning): manually inline k_madd_epi32_ to further hide
michael@0 1389 // instruction latency.
michael@0 1390 v[ 0] = k_madd_epi32(u[0], k32_p16_m16);
michael@0 1391 v[ 1] = k_madd_epi32(u[1], k32_p16_m16);
michael@0 1392 v[ 2] = k_madd_epi32(u[2], k32_p16_m16);
michael@0 1393 v[ 3] = k_madd_epi32(u[3], k32_p16_m16);
michael@0 1394 v[ 4] = k_madd_epi32(u[0], k32_p16_p16);
michael@0 1395 v[ 5] = k_madd_epi32(u[1], k32_p16_p16);
michael@0 1396 v[ 6] = k_madd_epi32(u[2], k32_p16_p16);
michael@0 1397 v[ 7] = k_madd_epi32(u[3], k32_p16_p16);
michael@0 1398
michael@0 1399 u[0] = k_packs_epi64(v[0], v[1]);
michael@0 1400 u[1] = k_packs_epi64(v[2], v[3]);
michael@0 1401 u[2] = k_packs_epi64(v[4], v[5]);
michael@0 1402 u[3] = k_packs_epi64(v[6], v[7]);
michael@0 1403
michael@0 1404 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1405 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1406 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1407 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1408
michael@0 1409 lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
michael@0 1410 lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
michael@0 1411 lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
michael@0 1412 lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
michael@0 1413 }
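// k_madd_epi32() is the 32-bit analogue of _mm_madd_epi16: it multiplies the
// interleaved 32-bit lanes and sums adjacent products into 64-bit lanes;
// k_packs_epi64() then gathers the low 32 bits of each 64-bit lane from two
// registers so the usual dct_const_round_shift can follow.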
michael@0 1414 {
michael@0 1415 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
michael@0 1416 const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
michael@0 1417 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
michael@0 1418
michael@0 1419 u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]);
michael@0 1420 u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]);
michael@0 1421 u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]);
michael@0 1422 u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]);
michael@0 1423 u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]);
michael@0 1424 u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]);
michael@0 1425 u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]);
michael@0 1426 u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]);
michael@0 1427 u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]);
michael@0 1428 u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]);
michael@0 1429 u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]);
michael@0 1430 u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]);
michael@0 1431 u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]);
michael@0 1432 u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]);
michael@0 1433 u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]);
michael@0 1434 u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]);
michael@0 1435
michael@0 1436 v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24);
michael@0 1437 v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24);
michael@0 1438 v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24);
michael@0 1439 v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24);
michael@0 1440 v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24);
michael@0 1441 v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24);
michael@0 1442 v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24);
michael@0 1443 v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24);
michael@0 1444 v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08);
michael@0 1445 v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08);
michael@0 1446 v[10] = k_madd_epi32(u[10], k32_m24_m08);
michael@0 1447 v[11] = k_madd_epi32(u[11], k32_m24_m08);
michael@0 1448 v[12] = k_madd_epi32(u[12], k32_m24_m08);
michael@0 1449 v[13] = k_madd_epi32(u[13], k32_m24_m08);
michael@0 1450 v[14] = k_madd_epi32(u[14], k32_m24_m08);
michael@0 1451 v[15] = k_madd_epi32(u[15], k32_m24_m08);
michael@0 1452 v[16] = k_madd_epi32(u[12], k32_m08_p24);
michael@0 1453 v[17] = k_madd_epi32(u[13], k32_m08_p24);
michael@0 1454 v[18] = k_madd_epi32(u[14], k32_m08_p24);
michael@0 1455 v[19] = k_madd_epi32(u[15], k32_m08_p24);
michael@0 1456 v[20] = k_madd_epi32(u[ 8], k32_m08_p24);
michael@0 1457 v[21] = k_madd_epi32(u[ 9], k32_m08_p24);
michael@0 1458 v[22] = k_madd_epi32(u[10], k32_m08_p24);
michael@0 1459 v[23] = k_madd_epi32(u[11], k32_m08_p24);
michael@0 1460 v[24] = k_madd_epi32(u[ 4], k32_p24_p08);
michael@0 1461 v[25] = k_madd_epi32(u[ 5], k32_p24_p08);
michael@0 1462 v[26] = k_madd_epi32(u[ 6], k32_p24_p08);
michael@0 1463 v[27] = k_madd_epi32(u[ 7], k32_p24_p08);
michael@0 1464 v[28] = k_madd_epi32(u[ 0], k32_p24_p08);
michael@0 1465 v[29] = k_madd_epi32(u[ 1], k32_p24_p08);
michael@0 1466 v[30] = k_madd_epi32(u[ 2], k32_p24_p08);
michael@0 1467 v[31] = k_madd_epi32(u[ 3], k32_p24_p08);
michael@0 1468
michael@0 1469 u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
michael@0 1470 u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
michael@0 1471 u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
michael@0 1472 u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
michael@0 1473 u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
michael@0 1474 u[ 5] = k_packs_epi64(v[10], v[11]);
michael@0 1475 u[ 6] = k_packs_epi64(v[12], v[13]);
michael@0 1476 u[ 7] = k_packs_epi64(v[14], v[15]);
michael@0 1477 u[ 8] = k_packs_epi64(v[16], v[17]);
michael@0 1478 u[ 9] = k_packs_epi64(v[18], v[19]);
michael@0 1479 u[10] = k_packs_epi64(v[20], v[21]);
michael@0 1480 u[11] = k_packs_epi64(v[22], v[23]);
michael@0 1481 u[12] = k_packs_epi64(v[24], v[25]);
michael@0 1482 u[13] = k_packs_epi64(v[26], v[27]);
michael@0 1483 u[14] = k_packs_epi64(v[28], v[29]);
michael@0 1484 u[15] = k_packs_epi64(v[30], v[31]);
michael@0 1485
michael@0 1486 v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
michael@0 1487 v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
michael@0 1488 v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
michael@0 1489 v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
michael@0 1490 v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
michael@0 1491 v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
michael@0 1492 v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
michael@0 1493 v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
michael@0 1494 v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
michael@0 1495 v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
michael@0 1496 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 1497 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 1498 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 1499 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 1500 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 1501 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 1502
michael@0 1503 lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
michael@0 1504 lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
michael@0 1505 lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
michael@0 1506 lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
michael@0 1507 lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
michael@0 1508 lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
michael@0 1509 lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
michael@0 1510 lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
michael@0 1511 lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
michael@0 1512 lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
michael@0 1513 lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 1514 lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 1515 lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 1516 lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 1517 lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 1518 lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 1519 }
michael@0 1520 // stage 5
michael@0 1521 {
michael@0 1522 lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]);
michael@0 1523 lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]);
michael@0 1524 lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]);
michael@0 1525 lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]);
michael@0 1526 lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]);
michael@0 1527 lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]);
michael@0 1528 lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]);
michael@0 1529 lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]);
michael@0 1530 }
michael@0 1531 {
michael@0 1532 const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64);
michael@0 1533 const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64);
michael@0 1534 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
michael@0 1535 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
michael@0 1536
michael@0 1537 u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]);
michael@0 1538 u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]);
michael@0 1539 u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]);
michael@0 1540 u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]);
michael@0 1541 u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]);
michael@0 1542 u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]);
michael@0 1543 u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]);
michael@0 1544 u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]);
michael@0 1545
michael@0 1546 // TODO(jingning): manually inline k_madd_epi32_ to further hide
michael@0 1547 // instruction latency.
michael@0 1548 v[ 0] = k_madd_epi32(u[0], k32_p16_p16);
michael@0 1549 v[ 1] = k_madd_epi32(u[1], k32_p16_p16);
michael@0 1550 v[ 2] = k_madd_epi32(u[2], k32_p16_p16);
michael@0 1551 v[ 3] = k_madd_epi32(u[3], k32_p16_p16);
michael@0 1552 v[ 4] = k_madd_epi32(u[0], k32_p16_m16);
michael@0 1553 v[ 5] = k_madd_epi32(u[1], k32_p16_m16);
michael@0 1554 v[ 6] = k_madd_epi32(u[2], k32_p16_m16);
michael@0 1555 v[ 7] = k_madd_epi32(u[3], k32_p16_m16);
michael@0 1556 v[ 8] = k_madd_epi32(u[4], k32_p24_p08);
michael@0 1557 v[ 9] = k_madd_epi32(u[5], k32_p24_p08);
michael@0 1558 v[10] = k_madd_epi32(u[6], k32_p24_p08);
michael@0 1559 v[11] = k_madd_epi32(u[7], k32_p24_p08);
michael@0 1560 v[12] = k_madd_epi32(u[4], k32_m08_p24);
michael@0 1561 v[13] = k_madd_epi32(u[5], k32_m08_p24);
michael@0 1562 v[14] = k_madd_epi32(u[6], k32_m08_p24);
michael@0 1563 v[15] = k_madd_epi32(u[7], k32_m08_p24);
michael@0 1564
michael@0 1565 u[0] = k_packs_epi64(v[0], v[1]);
michael@0 1566 u[1] = k_packs_epi64(v[2], v[3]);
michael@0 1567 u[2] = k_packs_epi64(v[4], v[5]);
michael@0 1568 u[3] = k_packs_epi64(v[6], v[7]);
michael@0 1569 u[4] = k_packs_epi64(v[8], v[9]);
michael@0 1570 u[5] = k_packs_epi64(v[10], v[11]);
michael@0 1571 u[6] = k_packs_epi64(v[12], v[13]);
michael@0 1572 u[7] = k_packs_epi64(v[14], v[15]);
michael@0 1573
michael@0 1574 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1575 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1576 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1577 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1578 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1579 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1580 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1581 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1582
michael@0 1583 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
michael@0 1584 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
michael@0 1585 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
michael@0 1586 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
michael@0 1587 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
michael@0 1588 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
michael@0 1589 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
michael@0 1590 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
michael@0 1591
michael@0 1592 sign[0] = _mm_cmplt_epi32(u[0], kZero);
michael@0 1593 sign[1] = _mm_cmplt_epi32(u[1], kZero);
michael@0 1594 sign[2] = _mm_cmplt_epi32(u[2], kZero);
michael@0 1595 sign[3] = _mm_cmplt_epi32(u[3], kZero);
michael@0 1596 sign[4] = _mm_cmplt_epi32(u[4], kZero);
michael@0 1597 sign[5] = _mm_cmplt_epi32(u[5], kZero);
michael@0 1598 sign[6] = _mm_cmplt_epi32(u[6], kZero);
michael@0 1599 sign[7] = _mm_cmplt_epi32(u[7], kZero);
michael@0 1600
michael@0 1601 u[0] = _mm_sub_epi32(u[0], sign[0]);
michael@0 1602 u[1] = _mm_sub_epi32(u[1], sign[1]);
michael@0 1603 u[2] = _mm_sub_epi32(u[2], sign[2]);
michael@0 1604 u[3] = _mm_sub_epi32(u[3], sign[3]);
michael@0 1605 u[4] = _mm_sub_epi32(u[4], sign[4]);
michael@0 1606 u[5] = _mm_sub_epi32(u[5], sign[5]);
michael@0 1607 u[6] = _mm_sub_epi32(u[6], sign[6]);
michael@0 1608 u[7] = _mm_sub_epi32(u[7], sign[7]);
michael@0 1609
michael@0 1610 u[0] = _mm_add_epi32(u[0], K32One);
michael@0 1611 u[1] = _mm_add_epi32(u[1], K32One);
michael@0 1612 u[2] = _mm_add_epi32(u[2], K32One);
michael@0 1613 u[3] = _mm_add_epi32(u[3], K32One);
michael@0 1614 u[4] = _mm_add_epi32(u[4], K32One);
michael@0 1615 u[5] = _mm_add_epi32(u[5], K32One);
michael@0 1616 u[6] = _mm_add_epi32(u[6], K32One);
michael@0 1617 u[7] = _mm_add_epi32(u[7], K32One);
michael@0 1618
michael@0 1619 u[0] = _mm_srai_epi32(u[0], 2);
michael@0 1620 u[1] = _mm_srai_epi32(u[1], 2);
michael@0 1621 u[2] = _mm_srai_epi32(u[2], 2);
michael@0 1622 u[3] = _mm_srai_epi32(u[3], 2);
michael@0 1623 u[4] = _mm_srai_epi32(u[4], 2);
michael@0 1624 u[5] = _mm_srai_epi32(u[5], 2);
michael@0 1625 u[6] = _mm_srai_epi32(u[6], 2);
michael@0 1626 u[7] = _mm_srai_epi32(u[7], 2);
michael@0 1627
michael@0 1628 // Combine
michael@0 1629 out[ 0] = _mm_packs_epi32(u[0], u[1]);
michael@0 1630 out[16] = _mm_packs_epi32(u[2], u[3]);
michael@0 1631 out[ 8] = _mm_packs_epi32(u[4], u[5]);
michael@0 1632 out[24] = _mm_packs_epi32(u[6], u[7]);
michael@0 1633 }
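// The cmplt/sub/add-K32One/srai-2 sequence above implements this branch's
// final rounding, equivalent to the scalar
//   y = (x + 1 + (x < 0)) >> 2;
// applied to each 32-bit lane before packing back to 16 bits.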
michael@0 1634 {
michael@0 1635 const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64);
michael@0 1636 const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64);
michael@0 1637 const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64);
michael@0 1638
michael@0 1639 u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]);
michael@0 1640 u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]);
michael@0 1641 u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]);
michael@0 1642 u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]);
michael@0 1643 u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]);
michael@0 1644 u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]);
michael@0 1645 u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]);
michael@0 1646 u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]);
michael@0 1647
michael@0 1648 v[0] = k_madd_epi32(u[0], k32_m08_p24);
michael@0 1649 v[1] = k_madd_epi32(u[1], k32_m08_p24);
michael@0 1650 v[2] = k_madd_epi32(u[2], k32_m08_p24);
michael@0 1651 v[3] = k_madd_epi32(u[3], k32_m08_p24);
michael@0 1652 v[4] = k_madd_epi32(u[4], k32_m24_m08);
michael@0 1653 v[5] = k_madd_epi32(u[5], k32_m24_m08);
michael@0 1654 v[6] = k_madd_epi32(u[6], k32_m24_m08);
michael@0 1655 v[7] = k_madd_epi32(u[7], k32_m24_m08);
michael@0 1656 v[ 8] = k_madd_epi32(u[4], k32_m08_p24);
michael@0 1657 v[ 9] = k_madd_epi32(u[5], k32_m08_p24);
michael@0 1658 v[10] = k_madd_epi32(u[6], k32_m08_p24);
michael@0 1659 v[11] = k_madd_epi32(u[7], k32_m08_p24);
michael@0 1660 v[12] = k_madd_epi32(u[0], k32_p24_p08);
michael@0 1661 v[13] = k_madd_epi32(u[1], k32_p24_p08);
michael@0 1662 v[14] = k_madd_epi32(u[2], k32_p24_p08);
michael@0 1663 v[15] = k_madd_epi32(u[3], k32_p24_p08);
michael@0 1664
michael@0 1665 u[0] = k_packs_epi64(v[0], v[1]);
michael@0 1666 u[1] = k_packs_epi64(v[2], v[3]);
michael@0 1667 u[2] = k_packs_epi64(v[4], v[5]);
michael@0 1668 u[3] = k_packs_epi64(v[6], v[7]);
michael@0 1669 u[4] = k_packs_epi64(v[8], v[9]);
michael@0 1670 u[5] = k_packs_epi64(v[10], v[11]);
michael@0 1671 u[6] = k_packs_epi64(v[12], v[13]);
michael@0 1672 u[7] = k_packs_epi64(v[14], v[15]);
michael@0 1673
michael@0 1674 u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1675 u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1676 u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1677 u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1678 u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1679 u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1680 u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1681 u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1682
michael@0 1683 lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
michael@0 1684 lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
michael@0 1685 lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
michael@0 1686 lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
michael@0 1687 lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
michael@0 1688 lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
michael@0 1689 lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
michael@0 1690 lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
michael@0 1691 }
michael@0 1692 {
michael@0 1693 lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]);
michael@0 1694 lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]);
michael@0 1695 lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]);
michael@0 1696 lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]);
michael@0 1697 lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]);
michael@0 1698 lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]);
michael@0 1699 lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]);
michael@0 1700 lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]);
michael@0 1701 lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]);
michael@0 1702 lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]);
michael@0 1703 lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]);
michael@0 1704 lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]);
michael@0 1705 lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]);
michael@0 1706 lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]);
michael@0 1707 lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]);
michael@0 1708 lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]);
michael@0 1709 lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]);
michael@0 1710 lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]);
michael@0 1711 lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]);
michael@0 1712 lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]);
michael@0 1713 lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]);
michael@0 1714 lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]);
michael@0 1715 lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]);
michael@0 1716 lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]);
michael@0 1717 lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]);
michael@0 1718 lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]);
michael@0 1719 lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]);
michael@0 1720 lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]);
michael@0 1721 lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]);
michael@0 1722 lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]);
michael@0 1723 lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]);
michael@0 1724 lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]);
michael@0 1725 }
michael@0 1726 // stage 6
michael@0 1727 {
michael@0 1728 const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
michael@0 1729 const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
michael@0 1730 const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
michael@0 1731 const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
michael@0 1732
michael@0 1733 u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
michael@0 1734 u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
michael@0 1735 u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
michael@0 1736 u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
michael@0 1737 u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
michael@0 1738 u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
michael@0 1739 u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
michael@0 1740 u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
michael@0 1741 u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]);
michael@0 1742 u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]);
michael@0 1743 u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]);
michael@0 1744 u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]);
michael@0 1745 u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]);
michael@0 1746 u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]);
michael@0 1747 u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]);
michael@0 1748 u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]);
michael@0 1749
michael@0 1750 v[0] = k_madd_epi32(u[0], k32_p28_p04);
michael@0 1751 v[1] = k_madd_epi32(u[1], k32_p28_p04);
michael@0 1752 v[2] = k_madd_epi32(u[2], k32_p28_p04);
michael@0 1753 v[3] = k_madd_epi32(u[3], k32_p28_p04);
michael@0 1754 v[4] = k_madd_epi32(u[4], k32_p12_p20);
michael@0 1755 v[5] = k_madd_epi32(u[5], k32_p12_p20);
michael@0 1756 v[6] = k_madd_epi32(u[6], k32_p12_p20);
michael@0 1757 v[7] = k_madd_epi32(u[7], k32_p12_p20);
michael@0 1758 v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
michael@0 1759 v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
michael@0 1760 v[10] = k_madd_epi32(u[10], k32_m20_p12);
michael@0 1761 v[11] = k_madd_epi32(u[11], k32_m20_p12);
michael@0 1762 v[12] = k_madd_epi32(u[12], k32_m04_p28);
michael@0 1763 v[13] = k_madd_epi32(u[13], k32_m04_p28);
michael@0 1764 v[14] = k_madd_epi32(u[14], k32_m04_p28);
michael@0 1765 v[15] = k_madd_epi32(u[15], k32_m04_p28);
michael@0 1766
michael@0 1767 u[0] = k_packs_epi64(v[0], v[1]);
michael@0 1768 u[1] = k_packs_epi64(v[2], v[3]);
michael@0 1769 u[2] = k_packs_epi64(v[4], v[5]);
michael@0 1770 u[3] = k_packs_epi64(v[6], v[7]);
michael@0 1771 u[4] = k_packs_epi64(v[8], v[9]);
michael@0 1772 u[5] = k_packs_epi64(v[10], v[11]);
michael@0 1773 u[6] = k_packs_epi64(v[12], v[13]);
michael@0 1774 u[7] = k_packs_epi64(v[14], v[15]);
michael@0 1775
michael@0 1776 v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
michael@0 1777 v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
michael@0 1778 v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
michael@0 1779 v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
michael@0 1780 v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
michael@0 1781 v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
michael@0 1782 v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
michael@0 1783 v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
michael@0 1784
michael@0 1785 u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
michael@0 1786 u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
michael@0 1787 u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
michael@0 1788 u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
michael@0 1789 u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
michael@0 1790 u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
michael@0 1791 u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
michael@0 1792 u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
michael@0 1793
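// Round each 32-bit result as (x + 1 + (x < 0)) >> 2 before packing to 16 bits:
// the compares yield -1 in negative lanes, and subtracting that adds the extra 1.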
michael@0 1794 sign[0] = _mm_cmplt_epi32(u[0], kZero);
michael@0 1795 sign[1] = _mm_cmplt_epi32(u[1], kZero);
michael@0 1796 sign[2] = _mm_cmplt_epi32(u[2], kZero);
michael@0 1797 sign[3] = _mm_cmplt_epi32(u[3], kZero);
michael@0 1798 sign[4] = _mm_cmplt_epi32(u[4], kZero);
michael@0 1799 sign[5] = _mm_cmplt_epi32(u[5], kZero);
michael@0 1800 sign[6] = _mm_cmplt_epi32(u[6], kZero);
michael@0 1801 sign[7] = _mm_cmplt_epi32(u[7], kZero);
michael@0 1802
michael@0 1803 u[0] = _mm_sub_epi32(u[0], sign[0]);
michael@0 1804 u[1] = _mm_sub_epi32(u[1], sign[1]);
michael@0 1805 u[2] = _mm_sub_epi32(u[2], sign[2]);
michael@0 1806 u[3] = _mm_sub_epi32(u[3], sign[3]);
michael@0 1807 u[4] = _mm_sub_epi32(u[4], sign[4]);
michael@0 1808 u[5] = _mm_sub_epi32(u[5], sign[5]);
michael@0 1809 u[6] = _mm_sub_epi32(u[6], sign[6]);
michael@0 1810 u[7] = _mm_sub_epi32(u[7], sign[7]);
michael@0 1811
michael@0 1812 u[0] = _mm_add_epi32(u[0], K32One);
michael@0 1813 u[1] = _mm_add_epi32(u[1], K32One);
michael@0 1814 u[2] = _mm_add_epi32(u[2], K32One);
michael@0 1815 u[3] = _mm_add_epi32(u[3], K32One);
michael@0 1816 u[4] = _mm_add_epi32(u[4], K32One);
michael@0 1817 u[5] = _mm_add_epi32(u[5], K32One);
michael@0 1818 u[6] = _mm_add_epi32(u[6], K32One);
michael@0 1819 u[7] = _mm_add_epi32(u[7], K32One);
michael@0 1820
michael@0 1821 u[0] = _mm_srai_epi32(u[0], 2);
michael@0 1822 u[1] = _mm_srai_epi32(u[1], 2);
michael@0 1823 u[2] = _mm_srai_epi32(u[2], 2);
michael@0 1824 u[3] = _mm_srai_epi32(u[3], 2);
michael@0 1825 u[4] = _mm_srai_epi32(u[4], 2);
michael@0 1826 u[5] = _mm_srai_epi32(u[5], 2);
michael@0 1827 u[6] = _mm_srai_epi32(u[6], 2);
michael@0 1828 u[7] = _mm_srai_epi32(u[7], 2);
michael@0 1829
michael@0 1830 out[ 4] = _mm_packs_epi32(u[0], u[1]);
michael@0 1831 out[20] = _mm_packs_epi32(u[2], u[3]);
michael@0 1832 out[12] = _mm_packs_epi32(u[4], u[5]);
michael@0 1833 out[28] = _mm_packs_epi32(u[6], u[7]);
michael@0 1834 }
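// Add/sub butterflies producing lstep3[16..31] for the next odd-coefficient stage.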
michael@0 1835 {
michael@0 1836 lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]);
michael@0 1837 lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]);
michael@0 1838 lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]);
michael@0 1839 lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]);
michael@0 1840 lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]);
michael@0 1841 lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]);
michael@0 1842 lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]);
michael@0 1843 lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]);
michael@0 1844 lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]);
michael@0 1845 lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]);
michael@0 1846 lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]);
michael@0 1847 lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]);
michael@0 1848 lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]);
michael@0 1849 lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]);
michael@0 1850 lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]);
michael@0 1851 lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]);
michael@0 1852 }
michael@0 1853 {
michael@0 1854 const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64);
michael@0 1855 const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64);
michael@0 1856 const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64);
michael@0 1857 const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64,
michael@0 1858 -cospi_20_64);
michael@0 1859 const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64);
michael@0 1860 const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64);
michael@0 1861
michael@0 1862 u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]);
michael@0 1863 u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]);
michael@0 1864 u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]);
michael@0 1865 u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]);
michael@0 1866 u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]);
michael@0 1867 u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]);
michael@0 1868 u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]);
michael@0 1869 u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]);
michael@0 1870 u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]);
michael@0 1871 u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]);
michael@0 1872 u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]);
michael@0 1873 u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]);
michael@0 1874 u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]);
michael@0 1875 u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]);
michael@0 1876 u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]);
michael@0 1877 u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]);
michael@0 1878
michael@0 1879 v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28);
michael@0 1880 v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28);
michael@0 1881 v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28);
michael@0 1882 v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28);
michael@0 1883 v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04);
michael@0 1884 v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04);
michael@0 1885 v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04);
michael@0 1886 v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04);
michael@0 1887 v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12);
michael@0 1888 v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12);
michael@0 1889 v[10] = k_madd_epi32(u[10], k32_m20_p12);
michael@0 1890 v[11] = k_madd_epi32(u[11], k32_m20_p12);
michael@0 1891 v[12] = k_madd_epi32(u[12], k32_m12_m20);
michael@0 1892 v[13] = k_madd_epi32(u[13], k32_m12_m20);
michael@0 1893 v[14] = k_madd_epi32(u[14], k32_m12_m20);
michael@0 1894 v[15] = k_madd_epi32(u[15], k32_m12_m20);
michael@0 1895 v[16] = k_madd_epi32(u[12], k32_m20_p12);
michael@0 1896 v[17] = k_madd_epi32(u[13], k32_m20_p12);
michael@0 1897 v[18] = k_madd_epi32(u[14], k32_m20_p12);
michael@0 1898 v[19] = k_madd_epi32(u[15], k32_m20_p12);
michael@0 1899 v[20] = k_madd_epi32(u[ 8], k32_p12_p20);
michael@0 1900 v[21] = k_madd_epi32(u[ 9], k32_p12_p20);
michael@0 1901 v[22] = k_madd_epi32(u[10], k32_p12_p20);
michael@0 1902 v[23] = k_madd_epi32(u[11], k32_p12_p20);
michael@0 1903 v[24] = k_madd_epi32(u[ 4], k32_m04_p28);
michael@0 1904 v[25] = k_madd_epi32(u[ 5], k32_m04_p28);
michael@0 1905 v[26] = k_madd_epi32(u[ 6], k32_m04_p28);
michael@0 1906 v[27] = k_madd_epi32(u[ 7], k32_m04_p28);
michael@0 1907 v[28] = k_madd_epi32(u[ 0], k32_p28_p04);
michael@0 1908 v[29] = k_madd_epi32(u[ 1], k32_p28_p04);
michael@0 1909 v[30] = k_madd_epi32(u[ 2], k32_p28_p04);
michael@0 1910 v[31] = k_madd_epi32(u[ 3], k32_p28_p04);
michael@0 1911
michael@0 1912 u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
michael@0 1913 u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
michael@0 1914 u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
michael@0 1915 u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
michael@0 1916 u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
michael@0 1917 u[ 5] = k_packs_epi64(v[10], v[11]);
michael@0 1918 u[ 6] = k_packs_epi64(v[12], v[13]);
michael@0 1919 u[ 7] = k_packs_epi64(v[14], v[15]);
michael@0 1920 u[ 8] = k_packs_epi64(v[16], v[17]);
michael@0 1921 u[ 9] = k_packs_epi64(v[18], v[19]);
michael@0 1922 u[10] = k_packs_epi64(v[20], v[21]);
michael@0 1923 u[11] = k_packs_epi64(v[22], v[23]);
michael@0 1924 u[12] = k_packs_epi64(v[24], v[25]);
michael@0 1925 u[13] = k_packs_epi64(v[26], v[27]);
michael@0 1926 u[14] = k_packs_epi64(v[28], v[29]);
michael@0 1927 u[15] = k_packs_epi64(v[30], v[31]);
michael@0 1928
michael@0 1929 v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
michael@0 1930 v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
michael@0 1931 v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
michael@0 1932 v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
michael@0 1933 v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
michael@0 1934 v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
michael@0 1935 v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
michael@0 1936 v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
michael@0 1937 v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
michael@0 1938 v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
michael@0 1939 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 1940 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 1941 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 1942 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 1943 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 1944 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 1945
michael@0 1946 lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
michael@0 1947 lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
michael@0 1948 lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
michael@0 1949 lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
michael@0 1950 lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
michael@0 1951 lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
michael@0 1952 lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
michael@0 1953 lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
michael@0 1954 lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
michael@0 1955 lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
michael@0 1956 lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 1957 lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 1958 lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 1959 lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 1960 lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 1961 lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 1962 }
michael@0 1963 // stage 7
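// Same madd/round/pack pattern as stage 6, producing coefficients
// 2, 18, 10, 26, 6, 22, 14 and 30 from lstep3[16..31].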
michael@0 1964 {
michael@0 1965 const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64);
michael@0 1966 const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64);
michael@0 1967 const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64);
michael@0 1968 const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64);
michael@0 1969 const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64);
michael@0 1970 const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64);
michael@0 1971 const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64);
michael@0 1972 const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64);
michael@0 1973
michael@0 1974 u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]);
michael@0 1975 u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]);
michael@0 1976 u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]);
michael@0 1977 u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]);
michael@0 1978 u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]);
michael@0 1979 u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]);
michael@0 1980 u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]);
michael@0 1981 u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]);
michael@0 1982 u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]);
michael@0 1983 u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]);
michael@0 1984 u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]);
michael@0 1985 u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]);
michael@0 1986 u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]);
michael@0 1987 u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]);
michael@0 1988 u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]);
michael@0 1989 u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]);
michael@0 1990
michael@0 1991 v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02);
michael@0 1992 v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02);
michael@0 1993 v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02);
michael@0 1994 v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02);
michael@0 1995 v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18);
michael@0 1996 v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18);
michael@0 1997 v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18);
michael@0 1998 v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18);
michael@0 1999 v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10);
michael@0 2000 v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10);
michael@0 2001 v[10] = k_madd_epi32(u[10], k32_p22_p10);
michael@0 2002 v[11] = k_madd_epi32(u[11], k32_p22_p10);
michael@0 2003 v[12] = k_madd_epi32(u[12], k32_p06_p26);
michael@0 2004 v[13] = k_madd_epi32(u[13], k32_p06_p26);
michael@0 2005 v[14] = k_madd_epi32(u[14], k32_p06_p26);
michael@0 2006 v[15] = k_madd_epi32(u[15], k32_p06_p26);
michael@0 2007 v[16] = k_madd_epi32(u[12], k32_m26_p06);
michael@0 2008 v[17] = k_madd_epi32(u[13], k32_m26_p06);
michael@0 2009 v[18] = k_madd_epi32(u[14], k32_m26_p06);
michael@0 2010 v[19] = k_madd_epi32(u[15], k32_m26_p06);
michael@0 2011 v[20] = k_madd_epi32(u[ 8], k32_m10_p22);
michael@0 2012 v[21] = k_madd_epi32(u[ 9], k32_m10_p22);
michael@0 2013 v[22] = k_madd_epi32(u[10], k32_m10_p22);
michael@0 2014 v[23] = k_madd_epi32(u[11], k32_m10_p22);
michael@0 2015 v[24] = k_madd_epi32(u[ 4], k32_m18_p14);
michael@0 2016 v[25] = k_madd_epi32(u[ 5], k32_m18_p14);
michael@0 2017 v[26] = k_madd_epi32(u[ 6], k32_m18_p14);
michael@0 2018 v[27] = k_madd_epi32(u[ 7], k32_m18_p14);
michael@0 2019 v[28] = k_madd_epi32(u[ 0], k32_m02_p30);
michael@0 2020 v[29] = k_madd_epi32(u[ 1], k32_m02_p30);
michael@0 2021 v[30] = k_madd_epi32(u[ 2], k32_m02_p30);
michael@0 2022 v[31] = k_madd_epi32(u[ 3], k32_m02_p30);
michael@0 2023
michael@0 2024 u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
michael@0 2025 u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
michael@0 2026 u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
michael@0 2027 u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
michael@0 2028 u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
michael@0 2029 u[ 5] = k_packs_epi64(v[10], v[11]);
michael@0 2030 u[ 6] = k_packs_epi64(v[12], v[13]);
michael@0 2031 u[ 7] = k_packs_epi64(v[14], v[15]);
michael@0 2032 u[ 8] = k_packs_epi64(v[16], v[17]);
michael@0 2033 u[ 9] = k_packs_epi64(v[18], v[19]);
michael@0 2034 u[10] = k_packs_epi64(v[20], v[21]);
michael@0 2035 u[11] = k_packs_epi64(v[22], v[23]);
michael@0 2036 u[12] = k_packs_epi64(v[24], v[25]);
michael@0 2037 u[13] = k_packs_epi64(v[26], v[27]);
michael@0 2038 u[14] = k_packs_epi64(v[28], v[29]);
michael@0 2039 u[15] = k_packs_epi64(v[30], v[31]);
michael@0 2040
michael@0 2041 v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
michael@0 2042 v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
michael@0 2043 v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
michael@0 2044 v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
michael@0 2045 v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
michael@0 2046 v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
michael@0 2047 v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
michael@0 2048 v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
michael@0 2049 v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
michael@0 2050 v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
michael@0 2051 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 2052 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 2053 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 2054 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 2055 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 2056 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 2057
michael@0 2058 u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
michael@0 2059 u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
michael@0 2060 u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
michael@0 2061 u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
michael@0 2062 u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
michael@0 2063 u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
michael@0 2064 u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
michael@0 2065 u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
michael@0 2066 u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
michael@0 2067 u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
michael@0 2068 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 2069 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 2070 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 2071 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 2072 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 2073 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 2074
michael@0 2075 v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
michael@0 2076 v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
michael@0 2077 v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
michael@0 2078 v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
michael@0 2079 v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
michael@0 2080 v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
michael@0 2081 v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
michael@0 2082 v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
michael@0 2083 v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
michael@0 2084 v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
michael@0 2085 v[10] = _mm_cmplt_epi32(u[10], kZero);
michael@0 2086 v[11] = _mm_cmplt_epi32(u[11], kZero);
michael@0 2087 v[12] = _mm_cmplt_epi32(u[12], kZero);
michael@0 2088 v[13] = _mm_cmplt_epi32(u[13], kZero);
michael@0 2089 v[14] = _mm_cmplt_epi32(u[14], kZero);
michael@0 2090 v[15] = _mm_cmplt_epi32(u[15], kZero);
michael@0 2091
michael@0 2092 u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
michael@0 2093 u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
michael@0 2094 u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
michael@0 2095 u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
michael@0 2096 u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
michael@0 2097 u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
michael@0 2098 u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
michael@0 2099 u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
michael@0 2100 u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
michael@0 2101 u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
michael@0 2102 u[10] = _mm_sub_epi32(u[10], v[10]);
michael@0 2103 u[11] = _mm_sub_epi32(u[11], v[11]);
michael@0 2104 u[12] = _mm_sub_epi32(u[12], v[12]);
michael@0 2105 u[13] = _mm_sub_epi32(u[13], v[13]);
michael@0 2106 u[14] = _mm_sub_epi32(u[14], v[14]);
michael@0 2107 u[15] = _mm_sub_epi32(u[15], v[15]);
michael@0 2108
michael@0 2109 v[ 0] = _mm_add_epi32(u[ 0], K32One);
michael@0 2110 v[ 1] = _mm_add_epi32(u[ 1], K32One);
michael@0 2111 v[ 2] = _mm_add_epi32(u[ 2], K32One);
michael@0 2112 v[ 3] = _mm_add_epi32(u[ 3], K32One);
michael@0 2113 v[ 4] = _mm_add_epi32(u[ 4], K32One);
michael@0 2114 v[ 5] = _mm_add_epi32(u[ 5], K32One);
michael@0 2115 v[ 6] = _mm_add_epi32(u[ 6], K32One);
michael@0 2116 v[ 7] = _mm_add_epi32(u[ 7], K32One);
michael@0 2117 v[ 8] = _mm_add_epi32(u[ 8], K32One);
michael@0 2118 v[ 9] = _mm_add_epi32(u[ 9], K32One);
michael@0 2119 v[10] = _mm_add_epi32(u[10], K32One);
michael@0 2120 v[11] = _mm_add_epi32(u[11], K32One);
michael@0 2121 v[12] = _mm_add_epi32(u[12], K32One);
michael@0 2122 v[13] = _mm_add_epi32(u[13], K32One);
michael@0 2123 v[14] = _mm_add_epi32(u[14], K32One);
michael@0 2124 v[15] = _mm_add_epi32(u[15], K32One);
michael@0 2125
michael@0 2126 u[ 0] = _mm_srai_epi32(v[ 0], 2);
michael@0 2127 u[ 1] = _mm_srai_epi32(v[ 1], 2);
michael@0 2128 u[ 2] = _mm_srai_epi32(v[ 2], 2);
michael@0 2129 u[ 3] = _mm_srai_epi32(v[ 3], 2);
michael@0 2130 u[ 4] = _mm_srai_epi32(v[ 4], 2);
michael@0 2131 u[ 5] = _mm_srai_epi32(v[ 5], 2);
michael@0 2132 u[ 6] = _mm_srai_epi32(v[ 6], 2);
michael@0 2133 u[ 7] = _mm_srai_epi32(v[ 7], 2);
michael@0 2134 u[ 8] = _mm_srai_epi32(v[ 8], 2);
michael@0 2135 u[ 9] = _mm_srai_epi32(v[ 9], 2);
michael@0 2136 u[10] = _mm_srai_epi32(v[10], 2);
michael@0 2137 u[11] = _mm_srai_epi32(v[11], 2);
michael@0 2138 u[12] = _mm_srai_epi32(v[12], 2);
michael@0 2139 u[13] = _mm_srai_epi32(v[13], 2);
michael@0 2140 u[14] = _mm_srai_epi32(v[14], 2);
michael@0 2141 u[15] = _mm_srai_epi32(v[15], 2);
michael@0 2142
michael@0 2143 out[ 2] = _mm_packs_epi32(u[0], u[1]);
michael@0 2144 out[18] = _mm_packs_epi32(u[2], u[3]);
michael@0 2145 out[10] = _mm_packs_epi32(u[4], u[5]);
michael@0 2146 out[26] = _mm_packs_epi32(u[6], u[7]);
michael@0 2147 out[ 6] = _mm_packs_epi32(u[8], u[9]);
michael@0 2148 out[22] = _mm_packs_epi32(u[10], u[11]);
michael@0 2149 out[14] = _mm_packs_epi32(u[12], u[13]);
michael@0 2150 out[30] = _mm_packs_epi32(u[14], u[15]);
michael@0 2151 }
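// Add/sub butterflies producing lstep1[32..63], consumed by stage 8 below.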
michael@0 2152 {
michael@0 2153 lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]);
michael@0 2154 lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]);
michael@0 2155 lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]);
michael@0 2156 lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]);
michael@0 2157 lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]);
michael@0 2158 lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]);
michael@0 2159 lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]);
michael@0 2160 lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]);
michael@0 2161 lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]);
michael@0 2162 lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]);
michael@0 2163 lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]);
michael@0 2164 lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]);
michael@0 2165 lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]);
michael@0 2166 lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]);
michael@0 2167 lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]);
michael@0 2168 lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]);
michael@0 2169 lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]);
michael@0 2170 lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]);
michael@0 2171 lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]);
michael@0 2172 lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]);
michael@0 2173 lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]);
michael@0 2174 lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]);
michael@0 2175 lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]);
michael@0 2176 lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]);
michael@0 2177 lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]);
michael@0 2178 lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]);
michael@0 2179 lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]);
michael@0 2180 lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]);
michael@0 2181 lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]);
michael@0 2182 lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]);
michael@0 2183 lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]);
michael@0 2184 lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]);
michael@0 2185 }
michael@0 2186 // stage 8
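// The final rotations: this block emits coefficients 1, 17, 9, 25, 7, 23, 15 and 31;
// the block after it emits 5, 21, 13, 29, 3, 19, 11 and 27.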
michael@0 2187 {
michael@0 2188 const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64);
michael@0 2189 const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64);
michael@0 2190 const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64);
michael@0 2191 const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64);
michael@0 2192 const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64);
michael@0 2193 const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64);
michael@0 2194 const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64);
michael@0 2195 const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64);
michael@0 2196
michael@0 2197 u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]);
michael@0 2198 u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]);
michael@0 2199 u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]);
michael@0 2200 u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]);
michael@0 2201 u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]);
michael@0 2202 u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]);
michael@0 2203 u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]);
michael@0 2204 u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]);
michael@0 2205 u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]);
michael@0 2206 u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]);
michael@0 2207 u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]);
michael@0 2208 u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]);
michael@0 2209 u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]);
michael@0 2210 u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]);
michael@0 2211 u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]);
michael@0 2212 u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]);
michael@0 2213
michael@0 2214 v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01);
michael@0 2215 v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01);
michael@0 2216 v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01);
michael@0 2217 v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01);
michael@0 2218 v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17);
michael@0 2219 v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17);
michael@0 2220 v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17);
michael@0 2221 v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17);
michael@0 2222 v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09);
michael@0 2223 v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09);
michael@0 2224 v[10] = k_madd_epi32(u[10], k32_p23_p09);
michael@0 2225 v[11] = k_madd_epi32(u[11], k32_p23_p09);
michael@0 2226 v[12] = k_madd_epi32(u[12], k32_p07_p25);
michael@0 2227 v[13] = k_madd_epi32(u[13], k32_p07_p25);
michael@0 2228 v[14] = k_madd_epi32(u[14], k32_p07_p25);
michael@0 2229 v[15] = k_madd_epi32(u[15], k32_p07_p25);
michael@0 2230 v[16] = k_madd_epi32(u[12], k32_m25_p07);
michael@0 2231 v[17] = k_madd_epi32(u[13], k32_m25_p07);
michael@0 2232 v[18] = k_madd_epi32(u[14], k32_m25_p07);
michael@0 2233 v[19] = k_madd_epi32(u[15], k32_m25_p07);
michael@0 2234 v[20] = k_madd_epi32(u[ 8], k32_m09_p23);
michael@0 2235 v[21] = k_madd_epi32(u[ 9], k32_m09_p23);
michael@0 2236 v[22] = k_madd_epi32(u[10], k32_m09_p23);
michael@0 2237 v[23] = k_madd_epi32(u[11], k32_m09_p23);
michael@0 2238 v[24] = k_madd_epi32(u[ 4], k32_m17_p15);
michael@0 2239 v[25] = k_madd_epi32(u[ 5], k32_m17_p15);
michael@0 2240 v[26] = k_madd_epi32(u[ 6], k32_m17_p15);
michael@0 2241 v[27] = k_madd_epi32(u[ 7], k32_m17_p15);
michael@0 2242 v[28] = k_madd_epi32(u[ 0], k32_m01_p31);
michael@0 2243 v[29] = k_madd_epi32(u[ 1], k32_m01_p31);
michael@0 2244 v[30] = k_madd_epi32(u[ 2], k32_m01_p31);
michael@0 2245 v[31] = k_madd_epi32(u[ 3], k32_m01_p31);
michael@0 2246
michael@0 2247 u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
michael@0 2248 u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
michael@0 2249 u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
michael@0 2250 u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
michael@0 2251 u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
michael@0 2252 u[ 5] = k_packs_epi64(v[10], v[11]);
michael@0 2253 u[ 6] = k_packs_epi64(v[12], v[13]);
michael@0 2254 u[ 7] = k_packs_epi64(v[14], v[15]);
michael@0 2255 u[ 8] = k_packs_epi64(v[16], v[17]);
michael@0 2256 u[ 9] = k_packs_epi64(v[18], v[19]);
michael@0 2257 u[10] = k_packs_epi64(v[20], v[21]);
michael@0 2258 u[11] = k_packs_epi64(v[22], v[23]);
michael@0 2259 u[12] = k_packs_epi64(v[24], v[25]);
michael@0 2260 u[13] = k_packs_epi64(v[26], v[27]);
michael@0 2261 u[14] = k_packs_epi64(v[28], v[29]);
michael@0 2262 u[15] = k_packs_epi64(v[30], v[31]);
michael@0 2263
michael@0 2264 v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
michael@0 2265 v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
michael@0 2266 v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
michael@0 2267 v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
michael@0 2268 v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
michael@0 2269 v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
michael@0 2270 v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
michael@0 2271 v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
michael@0 2272 v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
michael@0 2273 v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
michael@0 2274 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 2275 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 2276 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 2277 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 2278 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 2279 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 2280
michael@0 2281 u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
michael@0 2282 u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
michael@0 2283 u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
michael@0 2284 u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
michael@0 2285 u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
michael@0 2286 u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
michael@0 2287 u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
michael@0 2288 u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
michael@0 2289 u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
michael@0 2290 u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
michael@0 2291 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 2292 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 2293 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 2294 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 2295 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 2296 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 2297
michael@0 2298 v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
michael@0 2299 v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
michael@0 2300 v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
michael@0 2301 v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
michael@0 2302 v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
michael@0 2303 v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
michael@0 2304 v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
michael@0 2305 v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
michael@0 2306 v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
michael@0 2307 v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
michael@0 2308 v[10] = _mm_cmplt_epi32(u[10], kZero);
michael@0 2309 v[11] = _mm_cmplt_epi32(u[11], kZero);
michael@0 2310 v[12] = _mm_cmplt_epi32(u[12], kZero);
michael@0 2311 v[13] = _mm_cmplt_epi32(u[13], kZero);
michael@0 2312 v[14] = _mm_cmplt_epi32(u[14], kZero);
michael@0 2313 v[15] = _mm_cmplt_epi32(u[15], kZero);
michael@0 2314
michael@0 2315 u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
michael@0 2316 u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
michael@0 2317 u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
michael@0 2318 u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
michael@0 2319 u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
michael@0 2320 u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
michael@0 2321 u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
michael@0 2322 u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
michael@0 2323 u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
michael@0 2324 u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
michael@0 2325 u[10] = _mm_sub_epi32(u[10], v[10]);
michael@0 2326 u[11] = _mm_sub_epi32(u[11], v[11]);
michael@0 2327 u[12] = _mm_sub_epi32(u[12], v[12]);
michael@0 2328 u[13] = _mm_sub_epi32(u[13], v[13]);
michael@0 2329 u[14] = _mm_sub_epi32(u[14], v[14]);
michael@0 2330 u[15] = _mm_sub_epi32(u[15], v[15]);
michael@0 2331
michael@0 2332 v[0] = _mm_add_epi32(u[0], K32One);
michael@0 2333 v[1] = _mm_add_epi32(u[1], K32One);
michael@0 2334 v[2] = _mm_add_epi32(u[2], K32One);
michael@0 2335 v[3] = _mm_add_epi32(u[3], K32One);
michael@0 2336 v[4] = _mm_add_epi32(u[4], K32One);
michael@0 2337 v[5] = _mm_add_epi32(u[5], K32One);
michael@0 2338 v[6] = _mm_add_epi32(u[6], K32One);
michael@0 2339 v[7] = _mm_add_epi32(u[7], K32One);
michael@0 2340 v[8] = _mm_add_epi32(u[8], K32One);
michael@0 2341 v[9] = _mm_add_epi32(u[9], K32One);
michael@0 2342 v[10] = _mm_add_epi32(u[10], K32One);
michael@0 2343 v[11] = _mm_add_epi32(u[11], K32One);
michael@0 2344 v[12] = _mm_add_epi32(u[12], K32One);
michael@0 2345 v[13] = _mm_add_epi32(u[13], K32One);
michael@0 2346 v[14] = _mm_add_epi32(u[14], K32One);
michael@0 2347 v[15] = _mm_add_epi32(u[15], K32One);
michael@0 2348
michael@0 2349 u[0] = _mm_srai_epi32(v[0], 2);
michael@0 2350 u[1] = _mm_srai_epi32(v[1], 2);
michael@0 2351 u[2] = _mm_srai_epi32(v[2], 2);
michael@0 2352 u[3] = _mm_srai_epi32(v[3], 2);
michael@0 2353 u[4] = _mm_srai_epi32(v[4], 2);
michael@0 2354 u[5] = _mm_srai_epi32(v[5], 2);
michael@0 2355 u[6] = _mm_srai_epi32(v[6], 2);
michael@0 2356 u[7] = _mm_srai_epi32(v[7], 2);
michael@0 2357 u[8] = _mm_srai_epi32(v[8], 2);
michael@0 2358 u[9] = _mm_srai_epi32(v[9], 2);
michael@0 2359 u[10] = _mm_srai_epi32(v[10], 2);
michael@0 2360 u[11] = _mm_srai_epi32(v[11], 2);
michael@0 2361 u[12] = _mm_srai_epi32(v[12], 2);
michael@0 2362 u[13] = _mm_srai_epi32(v[13], 2);
michael@0 2363 u[14] = _mm_srai_epi32(v[14], 2);
michael@0 2364 u[15] = _mm_srai_epi32(v[15], 2);
michael@0 2365
michael@0 2366 out[ 1] = _mm_packs_epi32(u[0], u[1]);
michael@0 2367 out[17] = _mm_packs_epi32(u[2], u[3]);
michael@0 2368 out[ 9] = _mm_packs_epi32(u[4], u[5]);
michael@0 2369 out[25] = _mm_packs_epi32(u[6], u[7]);
michael@0 2370 out[ 7] = _mm_packs_epi32(u[8], u[9]);
michael@0 2371 out[23] = _mm_packs_epi32(u[10], u[11]);
michael@0 2372 out[15] = _mm_packs_epi32(u[12], u[13]);
michael@0 2373 out[31] = _mm_packs_epi32(u[14], u[15]);
michael@0 2374 }
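// Second half of stage 8: coefficients 5, 21, 13, 29, 3, 19, 11 and 27.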
michael@0 2375 {
michael@0 2376 const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64);
michael@0 2377 const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64);
michael@0 2378 const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64);
michael@0 2379 const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64);
michael@0 2380 const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64);
michael@0 2381 const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64);
michael@0 2382 const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64);
michael@0 2383 const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64);
michael@0 2384
michael@0 2385 u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]);
michael@0 2386 u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]);
michael@0 2387 u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]);
michael@0 2388 u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]);
michael@0 2389 u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]);
michael@0 2390 u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]);
michael@0 2391 u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]);
michael@0 2392 u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]);
michael@0 2393 u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]);
michael@0 2394 u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]);
michael@0 2395 u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]);
michael@0 2396 u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]);
michael@0 2397 u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]);
michael@0 2398 u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]);
michael@0 2399 u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]);
michael@0 2400 u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]);
michael@0 2401
michael@0 2402 v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05);
michael@0 2403 v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05);
michael@0 2404 v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05);
michael@0 2405 v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05);
michael@0 2406 v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21);
michael@0 2407 v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21);
michael@0 2408 v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21);
michael@0 2409 v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21);
michael@0 2410 v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13);
michael@0 2411 v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13);
michael@0 2412 v[10] = k_madd_epi32(u[10], k32_p19_p13);
michael@0 2413 v[11] = k_madd_epi32(u[11], k32_p19_p13);
michael@0 2414 v[12] = k_madd_epi32(u[12], k32_p03_p29);
michael@0 2415 v[13] = k_madd_epi32(u[13], k32_p03_p29);
michael@0 2416 v[14] = k_madd_epi32(u[14], k32_p03_p29);
michael@0 2417 v[15] = k_madd_epi32(u[15], k32_p03_p29);
michael@0 2418 v[16] = k_madd_epi32(u[12], k32_m29_p03);
michael@0 2419 v[17] = k_madd_epi32(u[13], k32_m29_p03);
michael@0 2420 v[18] = k_madd_epi32(u[14], k32_m29_p03);
michael@0 2421 v[19] = k_madd_epi32(u[15], k32_m29_p03);
michael@0 2422 v[20] = k_madd_epi32(u[ 8], k32_m13_p19);
michael@0 2423 v[21] = k_madd_epi32(u[ 9], k32_m13_p19);
michael@0 2424 v[22] = k_madd_epi32(u[10], k32_m13_p19);
michael@0 2425 v[23] = k_madd_epi32(u[11], k32_m13_p19);
michael@0 2426 v[24] = k_madd_epi32(u[ 4], k32_m21_p11);
michael@0 2427 v[25] = k_madd_epi32(u[ 5], k32_m21_p11);
michael@0 2428 v[26] = k_madd_epi32(u[ 6], k32_m21_p11);
michael@0 2429 v[27] = k_madd_epi32(u[ 7], k32_m21_p11);
michael@0 2430 v[28] = k_madd_epi32(u[ 0], k32_m05_p27);
michael@0 2431 v[29] = k_madd_epi32(u[ 1], k32_m05_p27);
michael@0 2432 v[30] = k_madd_epi32(u[ 2], k32_m05_p27);
michael@0 2433 v[31] = k_madd_epi32(u[ 3], k32_m05_p27);
michael@0 2434
michael@0 2435 u[ 0] = k_packs_epi64(v[ 0], v[ 1]);
michael@0 2436 u[ 1] = k_packs_epi64(v[ 2], v[ 3]);
michael@0 2437 u[ 2] = k_packs_epi64(v[ 4], v[ 5]);
michael@0 2438 u[ 3] = k_packs_epi64(v[ 6], v[ 7]);
michael@0 2439 u[ 4] = k_packs_epi64(v[ 8], v[ 9]);
michael@0 2440 u[ 5] = k_packs_epi64(v[10], v[11]);
michael@0 2441 u[ 6] = k_packs_epi64(v[12], v[13]);
michael@0 2442 u[ 7] = k_packs_epi64(v[14], v[15]);
michael@0 2443 u[ 8] = k_packs_epi64(v[16], v[17]);
michael@0 2444 u[ 9] = k_packs_epi64(v[18], v[19]);
michael@0 2445 u[10] = k_packs_epi64(v[20], v[21]);
michael@0 2446 u[11] = k_packs_epi64(v[22], v[23]);
michael@0 2447 u[12] = k_packs_epi64(v[24], v[25]);
michael@0 2448 u[13] = k_packs_epi64(v[26], v[27]);
michael@0 2449 u[14] = k_packs_epi64(v[28], v[29]);
michael@0 2450 u[15] = k_packs_epi64(v[30], v[31]);
michael@0 2451
michael@0 2452 v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING);
michael@0 2453 v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING);
michael@0 2454 v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING);
michael@0 2455 v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING);
michael@0 2456 v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING);
michael@0 2457 v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING);
michael@0 2458 v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING);
michael@0 2459 v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING);
michael@0 2460 v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING);
michael@0 2461 v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING);
michael@0 2462 v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
michael@0 2463 v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
michael@0 2464 v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
michael@0 2465 v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
michael@0 2466 v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
michael@0 2467 v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
michael@0 2468
michael@0 2469 u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS);
michael@0 2470 u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS);
michael@0 2471 u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS);
michael@0 2472 u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS);
michael@0 2473 u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS);
michael@0 2474 u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS);
michael@0 2475 u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS);
michael@0 2476 u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS);
michael@0 2477 u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS);
michael@0 2478 u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS);
michael@0 2479 u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
michael@0 2480 u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
michael@0 2481 u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
michael@0 2482 u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
michael@0 2483 u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
michael@0 2484 u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
michael@0 2485
michael@0 2486 v[ 0] = _mm_cmplt_epi32(u[ 0], kZero);
michael@0 2487 v[ 1] = _mm_cmplt_epi32(u[ 1], kZero);
michael@0 2488 v[ 2] = _mm_cmplt_epi32(u[ 2], kZero);
michael@0 2489 v[ 3] = _mm_cmplt_epi32(u[ 3], kZero);
michael@0 2490 v[ 4] = _mm_cmplt_epi32(u[ 4], kZero);
michael@0 2491 v[ 5] = _mm_cmplt_epi32(u[ 5], kZero);
michael@0 2492 v[ 6] = _mm_cmplt_epi32(u[ 6], kZero);
michael@0 2493 v[ 7] = _mm_cmplt_epi32(u[ 7], kZero);
michael@0 2494 v[ 8] = _mm_cmplt_epi32(u[ 8], kZero);
michael@0 2495 v[ 9] = _mm_cmplt_epi32(u[ 9], kZero);
michael@0 2496 v[10] = _mm_cmplt_epi32(u[10], kZero);
michael@0 2497 v[11] = _mm_cmplt_epi32(u[11], kZero);
michael@0 2498 v[12] = _mm_cmplt_epi32(u[12], kZero);
michael@0 2499 v[13] = _mm_cmplt_epi32(u[13], kZero);
michael@0 2500 v[14] = _mm_cmplt_epi32(u[14], kZero);
michael@0 2501 v[15] = _mm_cmplt_epi32(u[15], kZero);
michael@0 2502
michael@0 2503 u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]);
michael@0 2504 u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]);
michael@0 2505 u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]);
michael@0 2506 u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]);
michael@0 2507 u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]);
michael@0 2508 u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]);
michael@0 2509 u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]);
michael@0 2510 u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]);
michael@0 2511 u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]);
michael@0 2512 u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]);
michael@0 2513 u[10] = _mm_sub_epi32(u[10], v[10]);
michael@0 2514 u[11] = _mm_sub_epi32(u[11], v[11]);
michael@0 2515 u[12] = _mm_sub_epi32(u[12], v[12]);
michael@0 2516 u[13] = _mm_sub_epi32(u[13], v[13]);
michael@0 2517 u[14] = _mm_sub_epi32(u[14], v[14]);
michael@0 2518 u[15] = _mm_sub_epi32(u[15], v[15]);
michael@0 2519
michael@0 2520 v[0] = _mm_add_epi32(u[0], K32One);
michael@0 2521 v[1] = _mm_add_epi32(u[1], K32One);
michael@0 2522 v[2] = _mm_add_epi32(u[2], K32One);
michael@0 2523 v[3] = _mm_add_epi32(u[3], K32One);
michael@0 2524 v[4] = _mm_add_epi32(u[4], K32One);
michael@0 2525 v[5] = _mm_add_epi32(u[5], K32One);
michael@0 2526 v[6] = _mm_add_epi32(u[6], K32One);
michael@0 2527 v[7] = _mm_add_epi32(u[7], K32One);
michael@0 2528 v[8] = _mm_add_epi32(u[8], K32One);
michael@0 2529 v[9] = _mm_add_epi32(u[9], K32One);
michael@0 2530 v[10] = _mm_add_epi32(u[10], K32One);
michael@0 2531 v[11] = _mm_add_epi32(u[11], K32One);
michael@0 2532 v[12] = _mm_add_epi32(u[12], K32One);
michael@0 2533 v[13] = _mm_add_epi32(u[13], K32One);
michael@0 2534 v[14] = _mm_add_epi32(u[14], K32One);
michael@0 2535 v[15] = _mm_add_epi32(u[15], K32One);
michael@0 2536
michael@0 2537 u[0] = _mm_srai_epi32(v[0], 2);
michael@0 2538 u[1] = _mm_srai_epi32(v[1], 2);
michael@0 2539 u[2] = _mm_srai_epi32(v[2], 2);
michael@0 2540 u[3] = _mm_srai_epi32(v[3], 2);
michael@0 2541 u[4] = _mm_srai_epi32(v[4], 2);
michael@0 2542 u[5] = _mm_srai_epi32(v[5], 2);
michael@0 2543 u[6] = _mm_srai_epi32(v[6], 2);
michael@0 2544 u[7] = _mm_srai_epi32(v[7], 2);
michael@0 2545 u[8] = _mm_srai_epi32(v[8], 2);
michael@0 2546 u[9] = _mm_srai_epi32(v[9], 2);
michael@0 2547 u[10] = _mm_srai_epi32(v[10], 2);
michael@0 2548 u[11] = _mm_srai_epi32(v[11], 2);
michael@0 2549 u[12] = _mm_srai_epi32(v[12], 2);
michael@0 2550 u[13] = _mm_srai_epi32(v[13], 2);
michael@0 2551 u[14] = _mm_srai_epi32(v[14], 2);
michael@0 2552 u[15] = _mm_srai_epi32(v[15], 2);
michael@0 2553
michael@0 2554 out[ 5] = _mm_packs_epi32(u[0], u[1]);
michael@0 2555 out[21] = _mm_packs_epi32(u[2], u[3]);
michael@0 2556 out[13] = _mm_packs_epi32(u[4], u[5]);
michael@0 2557 out[29] = _mm_packs_epi32(u[6], u[7]);
michael@0 2558 out[ 3] = _mm_packs_epi32(u[8], u[9]);
michael@0 2559 out[19] = _mm_packs_epi32(u[10], u[11]);
michael@0 2560 out[11] = _mm_packs_epi32(u[12], u[13]);
michael@0 2561 out[27] = _mm_packs_epi32(u[14], u[15]);
michael@0 2562 }
michael@0 2563 }
michael@0 2564 #endif
michael@0 2565 // Transpose the results, do it as four 8x8 transposes.
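// Each 8x8 tile is transposed with three interleave passes: 16-bit unpacks pair
// adjacent rows, 32-bit unpacks gather groups of four, and 64-bit unpacks
// assemble whole columns.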
michael@0 2566 {
michael@0 2567 int transpose_block;
michael@0 2568 int16_t *output;
michael@0 2569 if (0 == pass) {
michael@0 2570 output = &intermediate[column_start * 32];
michael@0 2571 } else {
michael@0 2572 output = &output_org[column_start * 32];
michael@0 2573 }
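// Pass 0 writes rows of the intermediate buffer; pass 1 writes the final
// coefficients to output_org.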
michael@0 2574 for (transpose_block = 0; transpose_block < 4; ++transpose_block) {
michael@0 2575 __m128i *this_out = &out[8 * transpose_block];
michael@0 2576 // 00 01 02 03 04 05 06 07
michael@0 2577 // 10 11 12 13 14 15 16 17
michael@0 2578 // 20 21 22 23 24 25 26 27
michael@0 2579 // 30 31 32 33 34 35 36 37
michael@0 2580 // 40 41 42 43 44 45 46 47
michael@0 2581 // 50 51 52 53 54 55 56 57
michael@0 2582 // 60 61 62 63 64 65 66 67
michael@0 2583 // 70 71 72 73 74 75 76 77
michael@0 2584 const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]);
michael@0 2585 const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]);
michael@0 2586 const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]);
michael@0 2587 const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]);
michael@0 2588 const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]);
michael@0 2589 const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]);
michael@0 2590 const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]);
michael@0 2591 const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]);
michael@0 2592 // 00 10 01 11 02 12 03 13
michael@0 2593 // 20 30 21 31 22 32 23 33
michael@0 2594 // 04 14 05 15 06 16 07 17
michael@0 2595 // 24 34 25 35 26 36 27 37
michael@0 2596 // 40 50 41 51 42 52 43 53
michael@0 2597 // 60 70 61 71 62 72 63 73
michael@0 2598 // 44 54 45 55 46 56 47 57
michael@0 2599 // 64 74 65 75 66 76 67 77
michael@0 2600 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
michael@0 2601 const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
michael@0 2602 const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
michael@0 2603 const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
michael@0 2604 const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
michael@0 2605 const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
michael@0 2606 const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
michael@0 2607 const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
michael@0 2608 // 00 10 20 30 01 11 21 31
michael@0 2609 // 40 50 60 70 41 51 61 71
michael@0 2610 // 02 12 22 32 03 13 23 33
michael@0 2611 // 42 52 62 72 43 53 63 73
michael@0 2612 // 04 14 24 34 05 15 25 35
michael@0 2613 // 44 54 64 74 45 55 65 75
michael@0 2614 // 06 16 26 36 07 17 27 37
michael@0 2615 // 46 56 66 76 47 57 67 77
michael@0 2616 __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
michael@0 2617 __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
michael@0 2618 __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
michael@0 2619 __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
michael@0 2620 __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
michael@0 2621 __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
michael@0 2622 __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
michael@0 2623 __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
michael@0 2624 // 00 10 20 30 40 50 60 70
michael@0 2625 // 01 11 21 31 41 51 61 71
michael@0 2626 // 02 12 22 32 42 52 62 72
michael@0 2627 // 03 13 23 33 43 53 63 73
michael@0 2628 // 04 14 24 34 44 54 64 74
michael@0 2629 // 05 15 25 35 45 55 65 75
michael@0 2630 // 06 16 26 36 46 56 66 76
michael@0 2631 // 07 17 27 37 47 57 67 77
michael@0 2632 if (0 == pass) {
michael@0 2633 // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2;
michael@0 2634 // TODO(cd): see quality impact of only doing
michael@0 2635 // output[j] = (output[j] + 1) >> 2;
michael@0 2636 // which would remove the code between here ...
michael@0 2637 __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero);
michael@0 2638 __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero);
michael@0 2639 __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero);
michael@0 2640 __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero);
michael@0 2641 __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero);
michael@0 2642 __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero);
michael@0 2643 __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero);
michael@0 2644 __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero);
michael@0 2645 tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0);
michael@0 2646 tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0);
michael@0 2647 tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0);
michael@0 2648 tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0);
michael@0 2649 tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0);
michael@0 2650 tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0);
michael@0 2651 tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0);
michael@0 2652 tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0);
michael@0 2653 // ... and here.
michael@0 2654 // PS: also change code in vp9/encoder/vp9_dct.c
michael@0 2655 tr2_0 = _mm_add_epi16(tr2_0, kOne);
michael@0 2656 tr2_1 = _mm_add_epi16(tr2_1, kOne);
michael@0 2657 tr2_2 = _mm_add_epi16(tr2_2, kOne);
michael@0 2658 tr2_3 = _mm_add_epi16(tr2_3, kOne);
michael@0 2659 tr2_4 = _mm_add_epi16(tr2_4, kOne);
michael@0 2660 tr2_5 = _mm_add_epi16(tr2_5, kOne);
michael@0 2661 tr2_6 = _mm_add_epi16(tr2_6, kOne);
michael@0 2662 tr2_7 = _mm_add_epi16(tr2_7, kOne);
michael@0 2663 tr2_0 = _mm_srai_epi16(tr2_0, 2);
michael@0 2664 tr2_1 = _mm_srai_epi16(tr2_1, 2);
michael@0 2665 tr2_2 = _mm_srai_epi16(tr2_2, 2);
michael@0 2666 tr2_3 = _mm_srai_epi16(tr2_3, 2);
michael@0 2667 tr2_4 = _mm_srai_epi16(tr2_4, 2);
michael@0 2668 tr2_5 = _mm_srai_epi16(tr2_5, 2);
michael@0 2669 tr2_6 = _mm_srai_epi16(tr2_6, 2);
michael@0 2670 tr2_7 = _mm_srai_epi16(tr2_7, 2);
michael@0 2671 }
michael@0 2672 // Note: even though all these stores are aligned, using the aligned
michael@0 2673 // intrinsic makes the code slightly slower.
michael@0 2674 _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0);
michael@0 2675 _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1);
michael@0 2676 _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2);
michael@0 2677 _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3);
michael@0 2678 _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4);
michael@0 2679 _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5);
michael@0 2680 _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6);
michael@0 2681 _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7);
michael@0 2682 // Process next 8x8
michael@0 2683 output += 8;
michael@0 2684 }
michael@0 2685 }
michael@0 2686 }
michael@0 2687 }
michael@0 2688 } // NOLINT