/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

#if FDCT32x32_HIGH_PRECISION
static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
  __m128i buf0, buf1;
  buf0 = _mm_mul_epu32(a, b);
  a = _mm_srli_epi64(a, 32);
  b = _mm_srli_epi64(b, 32);
  buf1 = _mm_mul_epu32(a, b);
  return _mm_add_epi64(buf0, buf1);
}

static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
  __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
  __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi64(buf0, buf1);
}
#endif
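
/*
 * Rough per-lane sketch of the helpers above, assuming the operands hold
 * packed 32-bit values as in the high-precision path that uses them:
 *
 *   k_madd_epi32(a, b)  ~  a.lo32 * b.lo32 + a.hi32 * b.hi32   per 64-bit lane
 *
 * computed with 32x32->64-bit multiplies, i.e. a 32-bit counterpart of
 * _mm_madd_epi16.  k_packs_epi64(a, b) then keeps only the low 32 bits of
 * each 64-bit sum and packs the four results from a and b into one vector
 * of 32-bit values.
 */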

void FDCT32x32_2D(const int16_t *input,
                  int16_t *output_org, int stride) {
  // Calculate pre-multiplied strides
  const int str1 = stride;
  const int str2 = 2 * stride;
  const int str3 = 2 * stride + str1;
  // We need an intermediate buffer between passes.
  DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]);
  // Constants
  //    In one case all eight 16-bit lanes hold the same value; in all other
  //    cases a pair of constants has to be repeated four times, which is done
  //    by constructing the 32-bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(+cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(+cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(+cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(+cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m28_m04 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_m12_m20 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(+cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(+cospi_14_64, cospi_18_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(+cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(+cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p31_p01 = pair_set_epi16(+cospi_31_64, cospi_1_64);
  const __m128i k__cospi_p15_p17 = pair_set_epi16(+cospi_15_64, cospi_17_64);
  const __m128i k__cospi_p23_p09 = pair_set_epi16(+cospi_23_64, cospi_9_64);
  const __m128i k__cospi_p07_p25 = pair_set_epi16(+cospi_7_64, cospi_25_64);
  const __m128i k__cospi_m25_p07 = pair_set_epi16(-cospi_25_64, cospi_7_64);
  const __m128i k__cospi_m09_p23 = pair_set_epi16(-cospi_9_64, cospi_23_64);
  const __m128i k__cospi_m17_p15 = pair_set_epi16(-cospi_17_64, cospi_15_64);
  const __m128i k__cospi_m01_p31 = pair_set_epi16(-cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p27_p05 = pair_set_epi16(+cospi_27_64, cospi_5_64);
  const __m128i k__cospi_p11_p21 = pair_set_epi16(+cospi_11_64, cospi_21_64);
  const __m128i k__cospi_p19_p13 = pair_set_epi16(+cospi_19_64, cospi_13_64);
  const __m128i k__cospi_p03_p29 = pair_set_epi16(+cospi_3_64, cospi_29_64);
  const __m128i k__cospi_m29_p03 = pair_set_epi16(-cospi_29_64, cospi_3_64);
  const __m128i k__cospi_m13_p19 = pair_set_epi16(-cospi_13_64, cospi_19_64);
  const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
  const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i kOne = _mm_set1_epi16(1);
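  // Usage sketch for the paired constants: pair_set_epi16(c0, c1) replicates
  // the 16-bit pair (c0, c1) into every 32-bit lane.  For a vector
  // u = _mm_unpacklo_epi16(x, y), which interleaves the 16-bit elements of x
  // and y,
  //   _mm_madd_epi16(u, pair_set_epi16(c0, c1))
  // yields the 32-bit values x_i * c0 + y_i * c1 -- one half of a butterfly
  // rotation per element, still at full precision before rounding.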
  // Do the two transform/transpose passes
  int pass;
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
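    // (Each __m128i used below holds eight int16 values, one per column of
    // the current group, so a 32x32 block is covered in four groups per pass;
    // pass 0 works on the input and leaves its result in 'intermediate',
    // pass 1 re-runs the same code on 'intermediate'.)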
    int column_start;
    for (column_start = 0; column_start < 32; column_start += 8) {
      __m128i step1[32];
      __m128i step2[32];
      __m128i step3[32];
      __m128i out[32];
      // Stage 1
      // Note: even though all the loads below are aligned, using the aligned
      //       intrinsic makes the code slightly slower.
      if (0 == pass) {
        const int16_t *in = &input[column_start];
        // step1[i] = (in[ 0 * stride] + in[(32 - 1) * stride]) << 2;
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
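        // Per-element sketch of the four unrolled blocks below: for i = 0..15,
        //   step1[i]      = (in[i * stride] + in[(31 - i) * stride]) << 2;
        //   step1[31 - i] = (in[i * stride] - in[(31 - i) * stride]) << 2;
        // i.e. the first butterfly stage plus the x4 up-scaling applied in
        // the first pass.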
        {
          const int16_t *ina = in + 0 * str1;
          const int16_t *inb = in + 31 * str1;
          __m128i *step1a = &step1[ 0];
          __m128i *step1b = &step1[31];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 4 * str1;
          const int16_t *inb = in + 27 * str1;
          __m128i *step1a = &step1[ 4];
          __m128i *step1b = &step1[27];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 8 * str1;
          const int16_t *inb = in + 23 * str1;
          __m128i *step1a = &step1[ 8];
          __m128i *step1b = &step1[23];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
        {
          const int16_t *ina = in + 12 * str1;
          const int16_t *inb = in + 19 * str1;
          __m128i *step1a = &step1[12];
          __m128i *step1b = &step1[19];
          const __m128i ina0 = _mm_loadu_si128((const __m128i *)(ina));
          const __m128i ina1 = _mm_loadu_si128((const __m128i *)(ina + str1));
          const __m128i ina2 = _mm_loadu_si128((const __m128i *)(ina + str2));
          const __m128i ina3 = _mm_loadu_si128((const __m128i *)(ina + str3));
          const __m128i inb3 = _mm_loadu_si128((const __m128i *)(inb - str3));
          const __m128i inb2 = _mm_loadu_si128((const __m128i *)(inb - str2));
          const __m128i inb1 = _mm_loadu_si128((const __m128i *)(inb - str1));
          const __m128i inb0 = _mm_loadu_si128((const __m128i *)(inb));
          step1a[ 0] = _mm_add_epi16(ina0, inb0);
          step1a[ 1] = _mm_add_epi16(ina1, inb1);
          step1a[ 2] = _mm_add_epi16(ina2, inb2);
          step1a[ 3] = _mm_add_epi16(ina3, inb3);
          step1b[-3] = _mm_sub_epi16(ina3, inb3);
          step1b[-2] = _mm_sub_epi16(ina2, inb2);
          step1b[-1] = _mm_sub_epi16(ina1, inb1);
          step1b[-0] = _mm_sub_epi16(ina0, inb0);
          step1a[ 0] = _mm_slli_epi16(step1a[ 0], 2);
          step1a[ 1] = _mm_slli_epi16(step1a[ 1], 2);
          step1a[ 2] = _mm_slli_epi16(step1a[ 2], 2);
          step1a[ 3] = _mm_slli_epi16(step1a[ 3], 2);
          step1b[-3] = _mm_slli_epi16(step1b[-3], 2);
          step1b[-2] = _mm_slli_epi16(step1b[-2], 2);
          step1b[-1] = _mm_slli_epi16(step1b[-1], 2);
          step1b[-0] = _mm_slli_epi16(step1b[-0], 2);
        }
      } else {
        int16_t *in = &intermediate[column_start];
        // step1[i] = in[ 0 * 32] + in[(32 - 1) * 32];
        // Note: using the same approach as above to have a common offset is
        //       counter-productive as all offsets can be calculated at
        //       compile time.
        // Note: the next four blocks could be in a loop. That would help the
        //       instruction cache but is actually slower.
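        // Per-element sketch of the four blocks below: for i = 0..15,
        //   step1[i]      = in[i * 32] + in[(31 - i) * 32];
        //   step1[31 - i] = in[i * 32] - in[(31 - i) * 32];
        // Unlike pass 0 there is no << 2 here; the data in 'intermediate' was
        // already scaled up by the first pass.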
        {
          __m128i in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 32));
          __m128i in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 32));
          __m128i in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 32));
          __m128i in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 32));
          __m128i in28 = _mm_loadu_si128((const __m128i *)(in + 28 * 32));
          __m128i in29 = _mm_loadu_si128((const __m128i *)(in + 29 * 32));
          __m128i in30 = _mm_loadu_si128((const __m128i *)(in + 30 * 32));
          __m128i in31 = _mm_loadu_si128((const __m128i *)(in + 31 * 32));
          step1[ 0] = _mm_add_epi16(in00, in31);
          step1[ 1] = _mm_add_epi16(in01, in30);
          step1[ 2] = _mm_add_epi16(in02, in29);
          step1[ 3] = _mm_add_epi16(in03, in28);
          step1[28] = _mm_sub_epi16(in03, in28);
          step1[29] = _mm_sub_epi16(in02, in29);
          step1[30] = _mm_sub_epi16(in01, in30);
          step1[31] = _mm_sub_epi16(in00, in31);
        }
        {
          __m128i in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 32));
          __m128i in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 32));
          __m128i in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 32));
          __m128i in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 32));
          __m128i in24 = _mm_loadu_si128((const __m128i *)(in + 24 * 32));
          __m128i in25 = _mm_loadu_si128((const __m128i *)(in + 25 * 32));
          __m128i in26 = _mm_loadu_si128((const __m128i *)(in + 26 * 32));
          __m128i in27 = _mm_loadu_si128((const __m128i *)(in + 27 * 32));
          step1[ 4] = _mm_add_epi16(in04, in27);
          step1[ 5] = _mm_add_epi16(in05, in26);
          step1[ 6] = _mm_add_epi16(in06, in25);
          step1[ 7] = _mm_add_epi16(in07, in24);
          step1[24] = _mm_sub_epi16(in07, in24);
          step1[25] = _mm_sub_epi16(in06, in25);
          step1[26] = _mm_sub_epi16(in05, in26);
          step1[27] = _mm_sub_epi16(in04, in27);
        }
        {
          __m128i in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 32));
          __m128i in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 32));
          __m128i in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 32));
          __m128i in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 32));
          __m128i in20 = _mm_loadu_si128((const __m128i *)(in + 20 * 32));
          __m128i in21 = _mm_loadu_si128((const __m128i *)(in + 21 * 32));
          __m128i in22 = _mm_loadu_si128((const __m128i *)(in + 22 * 32));
          __m128i in23 = _mm_loadu_si128((const __m128i *)(in + 23 * 32));
          step1[ 8] = _mm_add_epi16(in08, in23);
          step1[ 9] = _mm_add_epi16(in09, in22);
          step1[10] = _mm_add_epi16(in10, in21);
          step1[11] = _mm_add_epi16(in11, in20);
          step1[20] = _mm_sub_epi16(in11, in20);
          step1[21] = _mm_sub_epi16(in10, in21);
          step1[22] = _mm_sub_epi16(in09, in22);
          step1[23] = _mm_sub_epi16(in08, in23);
        }
        {
          __m128i in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 32));
          __m128i in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 32));
          __m128i in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 32));
          __m128i in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 32));
          __m128i in16 = _mm_loadu_si128((const __m128i *)(in + 16 * 32));
          __m128i in17 = _mm_loadu_si128((const __m128i *)(in + 17 * 32));
          __m128i in18 = _mm_loadu_si128((const __m128i *)(in + 18 * 32));
          __m128i in19 = _mm_loadu_si128((const __m128i *)(in + 19 * 32));
          step1[12] = _mm_add_epi16(in12, in19);
          step1[13] = _mm_add_epi16(in13, in18);
          step1[14] = _mm_add_epi16(in14, in17);
          step1[15] = _mm_add_epi16(in15, in16);
          step1[16] = _mm_sub_epi16(in15, in16);
          step1[17] = _mm_sub_epi16(in14, in17);
          step1[18] = _mm_sub_epi16(in13, in18);
          step1[19] = _mm_sub_epi16(in12, in19);
        }
      }
      // Stage 2
      {
        step2[ 0] = _mm_add_epi16(step1[0], step1[15]);
        step2[ 1] = _mm_add_epi16(step1[1], step1[14]);
        step2[ 2] = _mm_add_epi16(step1[2], step1[13]);
        step2[ 3] = _mm_add_epi16(step1[3], step1[12]);
        step2[ 4] = _mm_add_epi16(step1[4], step1[11]);
        step2[ 5] = _mm_add_epi16(step1[5], step1[10]);
        step2[ 6] = _mm_add_epi16(step1[6], step1[ 9]);
        step2[ 7] = _mm_add_epi16(step1[7], step1[ 8]);
        step2[ 8] = _mm_sub_epi16(step1[7], step1[ 8]);
        step2[ 9] = _mm_sub_epi16(step1[6], step1[ 9]);
        step2[10] = _mm_sub_epi16(step1[5], step1[10]);
        step2[11] = _mm_sub_epi16(step1[4], step1[11]);
        step2[12] = _mm_sub_epi16(step1[3], step1[12]);
        step2[13] = _mm_sub_epi16(step1[2], step1[13]);
        step2[14] = _mm_sub_epi16(step1[1], step1[14]);
        step2[15] = _mm_sub_epi16(step1[0], step1[15]);
      }
      {
        const __m128i s2_20_0 = _mm_unpacklo_epi16(step1[27], step1[20]);
        const __m128i s2_20_1 = _mm_unpackhi_epi16(step1[27], step1[20]);
        const __m128i s2_21_0 = _mm_unpacklo_epi16(step1[26], step1[21]);
        const __m128i s2_21_1 = _mm_unpackhi_epi16(step1[26], step1[21]);
        const __m128i s2_22_0 = _mm_unpacklo_epi16(step1[25], step1[22]);
        const __m128i s2_22_1 = _mm_unpackhi_epi16(step1[25], step1[22]);
        const __m128i s2_23_0 = _mm_unpacklo_epi16(step1[24], step1[23]);
        const __m128i s2_23_1 = _mm_unpackhi_epi16(step1[24], step1[23]);
        const __m128i s2_20_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_m16);
        const __m128i s2_20_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_m16);
        const __m128i s2_21_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_m16);
        const __m128i s2_21_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_m16);
        const __m128i s2_22_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_m16);
        const __m128i s2_22_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_m16);
        const __m128i s2_23_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_m16);
        const __m128i s2_23_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_m16);
        const __m128i s2_24_2 = _mm_madd_epi16(s2_23_0, k__cospi_p16_p16);
        const __m128i s2_24_3 = _mm_madd_epi16(s2_23_1, k__cospi_p16_p16);
        const __m128i s2_25_2 = _mm_madd_epi16(s2_22_0, k__cospi_p16_p16);
        const __m128i s2_25_3 = _mm_madd_epi16(s2_22_1, k__cospi_p16_p16);
        const __m128i s2_26_2 = _mm_madd_epi16(s2_21_0, k__cospi_p16_p16);
        const __m128i s2_26_3 = _mm_madd_epi16(s2_21_1, k__cospi_p16_p16);
        const __m128i s2_27_2 = _mm_madd_epi16(s2_20_0, k__cospi_p16_p16);
        const __m128i s2_27_3 = _mm_madd_epi16(s2_20_1, k__cospi_p16_p16);
        // dct_const_round_shift
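        // Scalar sketch of the rounding applied below, assuming the usual
        // DCT_CONST_ROUNDING == 1 << (DCT_CONST_BITS - 1):
        //   rounded = (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
        // i.e. each 32-bit product from _mm_madd_epi16 is rounded back onto
        // the 16-bit cospi fixed-point scale.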
        const __m128i s2_20_4 = _mm_add_epi32(s2_20_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_5 = _mm_add_epi32(s2_20_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_4 = _mm_add_epi32(s2_21_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_21_5 = _mm_add_epi32(s2_21_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_4 = _mm_add_epi32(s2_22_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_22_5 = _mm_add_epi32(s2_22_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_4 = _mm_add_epi32(s2_23_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_23_5 = _mm_add_epi32(s2_23_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_4 = _mm_add_epi32(s2_24_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_24_5 = _mm_add_epi32(s2_24_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_4 = _mm_add_epi32(s2_25_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_25_5 = _mm_add_epi32(s2_25_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_4 = _mm_add_epi32(s2_26_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_26_5 = _mm_add_epi32(s2_26_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_4 = _mm_add_epi32(s2_27_2, k__DCT_CONST_ROUNDING);
        const __m128i s2_27_5 = _mm_add_epi32(s2_27_3, k__DCT_CONST_ROUNDING);
        const __m128i s2_20_6 = _mm_srai_epi32(s2_20_4, DCT_CONST_BITS);
        const __m128i s2_20_7 = _mm_srai_epi32(s2_20_5, DCT_CONST_BITS);
        const __m128i s2_21_6 = _mm_srai_epi32(s2_21_4, DCT_CONST_BITS);
        const __m128i s2_21_7 = _mm_srai_epi32(s2_21_5, DCT_CONST_BITS);
        const __m128i s2_22_6 = _mm_srai_epi32(s2_22_4, DCT_CONST_BITS);
        const __m128i s2_22_7 = _mm_srai_epi32(s2_22_5, DCT_CONST_BITS);
        const __m128i s2_23_6 = _mm_srai_epi32(s2_23_4, DCT_CONST_BITS);
        const __m128i s2_23_7 = _mm_srai_epi32(s2_23_5, DCT_CONST_BITS);
        const __m128i s2_24_6 = _mm_srai_epi32(s2_24_4, DCT_CONST_BITS);
        const __m128i s2_24_7 = _mm_srai_epi32(s2_24_5, DCT_CONST_BITS);
        const __m128i s2_25_6 = _mm_srai_epi32(s2_25_4, DCT_CONST_BITS);
        const __m128i s2_25_7 = _mm_srai_epi32(s2_25_5, DCT_CONST_BITS);
        const __m128i s2_26_6 = _mm_srai_epi32(s2_26_4, DCT_CONST_BITS);
        const __m128i s2_26_7 = _mm_srai_epi32(s2_26_5, DCT_CONST_BITS);
        const __m128i s2_27_6 = _mm_srai_epi32(s2_27_4, DCT_CONST_BITS);
        const __m128i s2_27_7 = _mm_srai_epi32(s2_27_5, DCT_CONST_BITS);
        // Combine
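        // _mm_packs_epi32 narrows the rounded 32-bit results back to int16
        // with signed saturation, merging two vectors of four values into one
        // vector of eight.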
        step2[20] = _mm_packs_epi32(s2_20_6, s2_20_7);
        step2[21] = _mm_packs_epi32(s2_21_6, s2_21_7);
        step2[22] = _mm_packs_epi32(s2_22_6, s2_22_7);
        step2[23] = _mm_packs_epi32(s2_23_6, s2_23_7);
        step2[24] = _mm_packs_epi32(s2_24_6, s2_24_7);
        step2[25] = _mm_packs_epi32(s2_25_6, s2_25_7);
        step2[26] = _mm_packs_epi32(s2_26_6, s2_26_7);
        step2[27] = _mm_packs_epi32(s2_27_6, s2_27_7);
      }

#if !FDCT32x32_HIGH_PRECISION
      // Scale the magnitude down (a rounded shift right by 2) so that the
      // intermediate values stay within the range of 16 bits.
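      // Scalar sketch of the rounding performed by the block below:
      //   x = (x + 1 + (x < 0)) >> 2;
      // The compare with kZero yields -1 in negative lanes, so subtracting it
      // adds 1 there, kOne adds the other 1, and the arithmetic shift by 2
      // completes the rounded division by 4.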
      if (1 == pass) {
        __m128i s3_00_0 = _mm_cmplt_epi16(step2[ 0], kZero);
        __m128i s3_01_0 = _mm_cmplt_epi16(step2[ 1], kZero);
        __m128i s3_02_0 = _mm_cmplt_epi16(step2[ 2], kZero);
        __m128i s3_03_0 = _mm_cmplt_epi16(step2[ 3], kZero);
        __m128i s3_04_0 = _mm_cmplt_epi16(step2[ 4], kZero);
        __m128i s3_05_0 = _mm_cmplt_epi16(step2[ 5], kZero);
        __m128i s3_06_0 = _mm_cmplt_epi16(step2[ 6], kZero);
        __m128i s3_07_0 = _mm_cmplt_epi16(step2[ 7], kZero);
        __m128i s2_08_0 = _mm_cmplt_epi16(step2[ 8], kZero);
        __m128i s2_09_0 = _mm_cmplt_epi16(step2[ 9], kZero);
        __m128i s3_10_0 = _mm_cmplt_epi16(step2[10], kZero);
        __m128i s3_11_0 = _mm_cmplt_epi16(step2[11], kZero);
        __m128i s3_12_0 = _mm_cmplt_epi16(step2[12], kZero);
        __m128i s3_13_0 = _mm_cmplt_epi16(step2[13], kZero);
        __m128i s2_14_0 = _mm_cmplt_epi16(step2[14], kZero);
        __m128i s2_15_0 = _mm_cmplt_epi16(step2[15], kZero);
        __m128i s3_16_0 = _mm_cmplt_epi16(step1[16], kZero);
        __m128i s3_17_0 = _mm_cmplt_epi16(step1[17], kZero);
        __m128i s3_18_0 = _mm_cmplt_epi16(step1[18], kZero);
        __m128i s3_19_0 = _mm_cmplt_epi16(step1[19], kZero);
        __m128i s3_20_0 = _mm_cmplt_epi16(step2[20], kZero);
        __m128i s3_21_0 = _mm_cmplt_epi16(step2[21], kZero);
        __m128i s3_22_0 = _mm_cmplt_epi16(step2[22], kZero);
        __m128i s3_23_0 = _mm_cmplt_epi16(step2[23], kZero);
        __m128i s3_24_0 = _mm_cmplt_epi16(step2[24], kZero);
        __m128i s3_25_0 = _mm_cmplt_epi16(step2[25], kZero);
        __m128i s3_26_0 = _mm_cmplt_epi16(step2[26], kZero);
        __m128i s3_27_0 = _mm_cmplt_epi16(step2[27], kZero);
        __m128i s3_28_0 = _mm_cmplt_epi16(step1[28], kZero);
        __m128i s3_29_0 = _mm_cmplt_epi16(step1[29], kZero);
        __m128i s3_30_0 = _mm_cmplt_epi16(step1[30], kZero);
        __m128i s3_31_0 = _mm_cmplt_epi16(step1[31], kZero);

        step2[ 0] = _mm_sub_epi16(step2[ 0], s3_00_0);
        step2[ 1] = _mm_sub_epi16(step2[ 1], s3_01_0);
        step2[ 2] = _mm_sub_epi16(step2[ 2], s3_02_0);
        step2[ 3] = _mm_sub_epi16(step2[ 3], s3_03_0);
        step2[ 4] = _mm_sub_epi16(step2[ 4], s3_04_0);
        step2[ 5] = _mm_sub_epi16(step2[ 5], s3_05_0);
        step2[ 6] = _mm_sub_epi16(step2[ 6], s3_06_0);
        step2[ 7] = _mm_sub_epi16(step2[ 7], s3_07_0);
        step2[ 8] = _mm_sub_epi16(step2[ 8], s2_08_0);
        step2[ 9] = _mm_sub_epi16(step2[ 9], s2_09_0);
        step2[10] = _mm_sub_epi16(step2[10], s3_10_0);
        step2[11] = _mm_sub_epi16(step2[11], s3_11_0);
        step2[12] = _mm_sub_epi16(step2[12], s3_12_0);
        step2[13] = _mm_sub_epi16(step2[13], s3_13_0);
        step2[14] = _mm_sub_epi16(step2[14], s2_14_0);
        step2[15] = _mm_sub_epi16(step2[15], s2_15_0);
        step1[16] = _mm_sub_epi16(step1[16], s3_16_0);
        step1[17] = _mm_sub_epi16(step1[17], s3_17_0);
        step1[18] = _mm_sub_epi16(step1[18], s3_18_0);
        step1[19] = _mm_sub_epi16(step1[19], s3_19_0);
        step2[20] = _mm_sub_epi16(step2[20], s3_20_0);
        step2[21] = _mm_sub_epi16(step2[21], s3_21_0);
        step2[22] = _mm_sub_epi16(step2[22], s3_22_0);
        step2[23] = _mm_sub_epi16(step2[23], s3_23_0);
        step2[24] = _mm_sub_epi16(step2[24], s3_24_0);
        step2[25] = _mm_sub_epi16(step2[25], s3_25_0);
        step2[26] = _mm_sub_epi16(step2[26], s3_26_0);
        step2[27] = _mm_sub_epi16(step2[27], s3_27_0);
        step1[28] = _mm_sub_epi16(step1[28], s3_28_0);
        step1[29] = _mm_sub_epi16(step1[29], s3_29_0);
        step1[30] = _mm_sub_epi16(step1[30], s3_30_0);
        step1[31] = _mm_sub_epi16(step1[31], s3_31_0);

        step2[ 0] = _mm_add_epi16(step2[ 0], kOne);
        step2[ 1] = _mm_add_epi16(step2[ 1], kOne);
        step2[ 2] = _mm_add_epi16(step2[ 2], kOne);
        step2[ 3] = _mm_add_epi16(step2[ 3], kOne);
        step2[ 4] = _mm_add_epi16(step2[ 4], kOne);
        step2[ 5] = _mm_add_epi16(step2[ 5], kOne);
        step2[ 6] = _mm_add_epi16(step2[ 6], kOne);
        step2[ 7] = _mm_add_epi16(step2[ 7], kOne);
        step2[ 8] = _mm_add_epi16(step2[ 8], kOne);
        step2[ 9] = _mm_add_epi16(step2[ 9], kOne);
        step2[10] = _mm_add_epi16(step2[10], kOne);
        step2[11] = _mm_add_epi16(step2[11], kOne);
        step2[12] = _mm_add_epi16(step2[12], kOne);
        step2[13] = _mm_add_epi16(step2[13], kOne);
        step2[14] = _mm_add_epi16(step2[14], kOne);
        step2[15] = _mm_add_epi16(step2[15], kOne);
        step1[16] = _mm_add_epi16(step1[16], kOne);
        step1[17] = _mm_add_epi16(step1[17], kOne);
        step1[18] = _mm_add_epi16(step1[18], kOne);
        step1[19] = _mm_add_epi16(step1[19], kOne);
        step2[20] = _mm_add_epi16(step2[20], kOne);
        step2[21] = _mm_add_epi16(step2[21], kOne);
        step2[22] = _mm_add_epi16(step2[22], kOne);
        step2[23] = _mm_add_epi16(step2[23], kOne);
        step2[24] = _mm_add_epi16(step2[24], kOne);
        step2[25] = _mm_add_epi16(step2[25], kOne);
        step2[26] = _mm_add_epi16(step2[26], kOne);
        step2[27] = _mm_add_epi16(step2[27], kOne);
        step1[28] = _mm_add_epi16(step1[28], kOne);
        step1[29] = _mm_add_epi16(step1[29], kOne);
        step1[30] = _mm_add_epi16(step1[30], kOne);
        step1[31] = _mm_add_epi16(step1[31], kOne);

        step2[ 0] = _mm_srai_epi16(step2[ 0], 2);
        step2[ 1] = _mm_srai_epi16(step2[ 1], 2);
        step2[ 2] = _mm_srai_epi16(step2[ 2], 2);
        step2[ 3] = _mm_srai_epi16(step2[ 3], 2);
        step2[ 4] = _mm_srai_epi16(step2[ 4], 2);
        step2[ 5] = _mm_srai_epi16(step2[ 5], 2);
        step2[ 6] = _mm_srai_epi16(step2[ 6], 2);
        step2[ 7] = _mm_srai_epi16(step2[ 7], 2);
        step2[ 8] = _mm_srai_epi16(step2[ 8], 2);
        step2[ 9] = _mm_srai_epi16(step2[ 9], 2);
        step2[10] = _mm_srai_epi16(step2[10], 2);
        step2[11] = _mm_srai_epi16(step2[11], 2);
        step2[12] = _mm_srai_epi16(step2[12], 2);
        step2[13] = _mm_srai_epi16(step2[13], 2);
        step2[14] = _mm_srai_epi16(step2[14], 2);
        step2[15] = _mm_srai_epi16(step2[15], 2);
        step1[16] = _mm_srai_epi16(step1[16], 2);
        step1[17] = _mm_srai_epi16(step1[17], 2);
        step1[18] = _mm_srai_epi16(step1[18], 2);
        step1[19] = _mm_srai_epi16(step1[19], 2);
        step2[20] = _mm_srai_epi16(step2[20], 2);
        step2[21] = _mm_srai_epi16(step2[21], 2);
        step2[22] = _mm_srai_epi16(step2[22], 2);
        step2[23] = _mm_srai_epi16(step2[23], 2);
        step2[24] = _mm_srai_epi16(step2[24], 2);
        step2[25] = _mm_srai_epi16(step2[25], 2);
        step2[26] = _mm_srai_epi16(step2[26], 2);
        step2[27] = _mm_srai_epi16(step2[27], 2);
        step1[28] = _mm_srai_epi16(step1[28], 2);
        step1[29] = _mm_srai_epi16(step1[29], 2);
        step1[30] = _mm_srai_epi16(step1[30], 2);
        step1[31] = _mm_srai_epi16(step1[31], 2);
      }
#endif

#if FDCT32x32_HIGH_PRECISION
      if (pass == 0) {
#endif
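      // In a FDCT32x32_HIGH_PRECISION build this 16-bit path is only taken
      // for pass 0; the matching "} else {" later in the full file redoes the
      // remaining stages for pass 1 with 32-bit intermediates via the
      // k_madd_epi32()/k_packs_epi64() helpers above.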
michael@0 | 530 | // Stage 3 |
michael@0 | 531 | { |
michael@0 | 532 | step3[0] = _mm_add_epi16(step2[(8 - 1)], step2[0]); |
michael@0 | 533 | step3[1] = _mm_add_epi16(step2[(8 - 2)], step2[1]); |
michael@0 | 534 | step3[2] = _mm_add_epi16(step2[(8 - 3)], step2[2]); |
michael@0 | 535 | step3[3] = _mm_add_epi16(step2[(8 - 4)], step2[3]); |
michael@0 | 536 | step3[4] = _mm_sub_epi16(step2[(8 - 5)], step2[4]); |
michael@0 | 537 | step3[5] = _mm_sub_epi16(step2[(8 - 6)], step2[5]); |
michael@0 | 538 | step3[6] = _mm_sub_epi16(step2[(8 - 7)], step2[6]); |
michael@0 | 539 | step3[7] = _mm_sub_epi16(step2[(8 - 8)], step2[7]); |
michael@0 | 540 | } |
michael@0 | 541 | { |
michael@0 | 542 | const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); |
michael@0 | 543 | const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); |
michael@0 | 544 | const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); |
michael@0 | 545 | const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); |
michael@0 | 546 | const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); |
michael@0 | 547 | const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); |
michael@0 | 548 | const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); |
michael@0 | 549 | const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); |
michael@0 | 550 | const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); |
michael@0 | 551 | const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); |
michael@0 | 552 | const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); |
michael@0 | 553 | const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); |
michael@0 | 554 | // dct_const_round_shift |
michael@0 | 555 | const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); |
michael@0 | 556 | const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); |
michael@0 | 557 | const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); |
michael@0 | 558 | const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); |
michael@0 | 559 | const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); |
michael@0 | 560 | const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); |
michael@0 | 561 | const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); |
michael@0 | 562 | const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); |
michael@0 | 563 | const __m128i s3_10_6 = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); |
michael@0 | 564 | const __m128i s3_10_7 = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); |
michael@0 | 565 | const __m128i s3_11_6 = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); |
michael@0 | 566 | const __m128i s3_11_7 = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); |
michael@0 | 567 | const __m128i s3_12_6 = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); |
michael@0 | 568 | const __m128i s3_12_7 = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); |
michael@0 | 569 | const __m128i s3_13_6 = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); |
michael@0 | 570 | const __m128i s3_13_7 = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); |
michael@0 | 571 | // Combine |
michael@0 | 572 | step3[10] = _mm_packs_epi32(s3_10_6, s3_10_7); |
michael@0 | 573 | step3[11] = _mm_packs_epi32(s3_11_6, s3_11_7); |
michael@0 | 574 | step3[12] = _mm_packs_epi32(s3_12_6, s3_12_7); |
michael@0 | 575 | step3[13] = _mm_packs_epi32(s3_13_6, s3_13_7); |
michael@0 | 576 | } |
michael@0 | 577 | { |
michael@0 | 578 | step3[16] = _mm_add_epi16(step2[23], step1[16]); |
michael@0 | 579 | step3[17] = _mm_add_epi16(step2[22], step1[17]); |
michael@0 | 580 | step3[18] = _mm_add_epi16(step2[21], step1[18]); |
michael@0 | 581 | step3[19] = _mm_add_epi16(step2[20], step1[19]); |
michael@0 | 582 | step3[20] = _mm_sub_epi16(step1[19], step2[20]); |
michael@0 | 583 | step3[21] = _mm_sub_epi16(step1[18], step2[21]); |
michael@0 | 584 | step3[22] = _mm_sub_epi16(step1[17], step2[22]); |
michael@0 | 585 | step3[23] = _mm_sub_epi16(step1[16], step2[23]); |
michael@0 | 586 | step3[24] = _mm_sub_epi16(step1[31], step2[24]); |
michael@0 | 587 | step3[25] = _mm_sub_epi16(step1[30], step2[25]); |
michael@0 | 588 | step3[26] = _mm_sub_epi16(step1[29], step2[26]); |
michael@0 | 589 | step3[27] = _mm_sub_epi16(step1[28], step2[27]); |
michael@0 | 590 | step3[28] = _mm_add_epi16(step2[27], step1[28]); |
michael@0 | 591 | step3[29] = _mm_add_epi16(step2[26], step1[29]); |
michael@0 | 592 | step3[30] = _mm_add_epi16(step2[25], step1[30]); |
michael@0 | 593 | step3[31] = _mm_add_epi16(step2[24], step1[31]); |
michael@0 | 594 | } |
michael@0 | 595 | |
michael@0 | 596 | // Stage 4 |
michael@0 | 597 | { |
michael@0 | 598 | step1[ 0] = _mm_add_epi16(step3[ 3], step3[ 0]); |
michael@0 | 599 | step1[ 1] = _mm_add_epi16(step3[ 2], step3[ 1]); |
michael@0 | 600 | step1[ 2] = _mm_sub_epi16(step3[ 1], step3[ 2]); |
michael@0 | 601 | step1[ 3] = _mm_sub_epi16(step3[ 0], step3[ 3]); |
michael@0 | 602 | step1[ 8] = _mm_add_epi16(step3[11], step2[ 8]); |
michael@0 | 603 | step1[ 9] = _mm_add_epi16(step3[10], step2[ 9]); |
michael@0 | 604 | step1[10] = _mm_sub_epi16(step2[ 9], step3[10]); |
michael@0 | 605 | step1[11] = _mm_sub_epi16(step2[ 8], step3[11]); |
michael@0 | 606 | step1[12] = _mm_sub_epi16(step2[15], step3[12]); |
michael@0 | 607 | step1[13] = _mm_sub_epi16(step2[14], step3[13]); |
michael@0 | 608 | step1[14] = _mm_add_epi16(step3[13], step2[14]); |
michael@0 | 609 | step1[15] = _mm_add_epi16(step3[12], step2[15]); |
michael@0 | 610 | } |
michael@0 | 611 | { |
michael@0 | 612 | const __m128i s1_05_0 = _mm_unpacklo_epi16(step3[6], step3[5]); |
michael@0 | 613 | const __m128i s1_05_1 = _mm_unpackhi_epi16(step3[6], step3[5]); |
michael@0 | 614 | const __m128i s1_05_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_m16); |
michael@0 | 615 | const __m128i s1_05_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_m16); |
michael@0 | 616 | const __m128i s1_06_2 = _mm_madd_epi16(s1_05_0, k__cospi_p16_p16); |
michael@0 | 617 | const __m128i s1_06_3 = _mm_madd_epi16(s1_05_1, k__cospi_p16_p16); |
michael@0 | 618 | // dct_const_round_shift |
michael@0 | 619 | const __m128i s1_05_4 = _mm_add_epi32(s1_05_2, k__DCT_CONST_ROUNDING); |
michael@0 | 620 | const __m128i s1_05_5 = _mm_add_epi32(s1_05_3, k__DCT_CONST_ROUNDING); |
michael@0 | 621 | const __m128i s1_06_4 = _mm_add_epi32(s1_06_2, k__DCT_CONST_ROUNDING); |
michael@0 | 622 | const __m128i s1_06_5 = _mm_add_epi32(s1_06_3, k__DCT_CONST_ROUNDING); |
michael@0 | 623 | const __m128i s1_05_6 = _mm_srai_epi32(s1_05_4, DCT_CONST_BITS); |
michael@0 | 624 | const __m128i s1_05_7 = _mm_srai_epi32(s1_05_5, DCT_CONST_BITS); |
michael@0 | 625 | const __m128i s1_06_6 = _mm_srai_epi32(s1_06_4, DCT_CONST_BITS); |
michael@0 | 626 | const __m128i s1_06_7 = _mm_srai_epi32(s1_06_5, DCT_CONST_BITS); |
michael@0 | 627 | // Combine |
michael@0 | 628 | step1[5] = _mm_packs_epi32(s1_05_6, s1_05_7); |
michael@0 | 629 | step1[6] = _mm_packs_epi32(s1_06_6, s1_06_7); |
michael@0 | 630 | } |
michael@0 | 631 | { |
michael@0 | 632 | const __m128i s1_18_0 = _mm_unpacklo_epi16(step3[18], step3[29]); |
michael@0 | 633 | const __m128i s1_18_1 = _mm_unpackhi_epi16(step3[18], step3[29]); |
michael@0 | 634 | const __m128i s1_19_0 = _mm_unpacklo_epi16(step3[19], step3[28]); |
michael@0 | 635 | const __m128i s1_19_1 = _mm_unpackhi_epi16(step3[19], step3[28]); |
michael@0 | 636 | const __m128i s1_20_0 = _mm_unpacklo_epi16(step3[20], step3[27]); |
michael@0 | 637 | const __m128i s1_20_1 = _mm_unpackhi_epi16(step3[20], step3[27]); |
michael@0 | 638 | const __m128i s1_21_0 = _mm_unpacklo_epi16(step3[21], step3[26]); |
michael@0 | 639 | const __m128i s1_21_1 = _mm_unpackhi_epi16(step3[21], step3[26]); |
michael@0 | 640 | const __m128i s1_18_2 = _mm_madd_epi16(s1_18_0, k__cospi_m08_p24); |
michael@0 | 641 | const __m128i s1_18_3 = _mm_madd_epi16(s1_18_1, k__cospi_m08_p24); |
michael@0 | 642 | const __m128i s1_19_2 = _mm_madd_epi16(s1_19_0, k__cospi_m08_p24); |
michael@0 | 643 | const __m128i s1_19_3 = _mm_madd_epi16(s1_19_1, k__cospi_m08_p24); |
michael@0 | 644 | const __m128i s1_20_2 = _mm_madd_epi16(s1_20_0, k__cospi_m24_m08); |
michael@0 | 645 | const __m128i s1_20_3 = _mm_madd_epi16(s1_20_1, k__cospi_m24_m08); |
michael@0 | 646 | const __m128i s1_21_2 = _mm_madd_epi16(s1_21_0, k__cospi_m24_m08); |
michael@0 | 647 | const __m128i s1_21_3 = _mm_madd_epi16(s1_21_1, k__cospi_m24_m08); |
michael@0 | 648 | const __m128i s1_26_2 = _mm_madd_epi16(s1_21_0, k__cospi_m08_p24); |
michael@0 | 649 | const __m128i s1_26_3 = _mm_madd_epi16(s1_21_1, k__cospi_m08_p24); |
michael@0 | 650 | const __m128i s1_27_2 = _mm_madd_epi16(s1_20_0, k__cospi_m08_p24); |
michael@0 | 651 | const __m128i s1_27_3 = _mm_madd_epi16(s1_20_1, k__cospi_m08_p24); |
michael@0 | 652 | const __m128i s1_28_2 = _mm_madd_epi16(s1_19_0, k__cospi_p24_p08); |
michael@0 | 653 | const __m128i s1_28_3 = _mm_madd_epi16(s1_19_1, k__cospi_p24_p08); |
michael@0 | 654 | const __m128i s1_29_2 = _mm_madd_epi16(s1_18_0, k__cospi_p24_p08); |
michael@0 | 655 | const __m128i s1_29_3 = _mm_madd_epi16(s1_18_1, k__cospi_p24_p08); |
michael@0 | 656 | // dct_const_round_shift |
michael@0 | 657 | const __m128i s1_18_4 = _mm_add_epi32(s1_18_2, k__DCT_CONST_ROUNDING); |
michael@0 | 658 | const __m128i s1_18_5 = _mm_add_epi32(s1_18_3, k__DCT_CONST_ROUNDING); |
michael@0 | 659 | const __m128i s1_19_4 = _mm_add_epi32(s1_19_2, k__DCT_CONST_ROUNDING); |
michael@0 | 660 | const __m128i s1_19_5 = _mm_add_epi32(s1_19_3, k__DCT_CONST_ROUNDING); |
michael@0 | 661 | const __m128i s1_20_4 = _mm_add_epi32(s1_20_2, k__DCT_CONST_ROUNDING); |
michael@0 | 662 | const __m128i s1_20_5 = _mm_add_epi32(s1_20_3, k__DCT_CONST_ROUNDING); |
michael@0 | 663 | const __m128i s1_21_4 = _mm_add_epi32(s1_21_2, k__DCT_CONST_ROUNDING); |
michael@0 | 664 | const __m128i s1_21_5 = _mm_add_epi32(s1_21_3, k__DCT_CONST_ROUNDING); |
michael@0 | 665 | const __m128i s1_26_4 = _mm_add_epi32(s1_26_2, k__DCT_CONST_ROUNDING); |
michael@0 | 666 | const __m128i s1_26_5 = _mm_add_epi32(s1_26_3, k__DCT_CONST_ROUNDING); |
michael@0 | 667 | const __m128i s1_27_4 = _mm_add_epi32(s1_27_2, k__DCT_CONST_ROUNDING); |
michael@0 | 668 | const __m128i s1_27_5 = _mm_add_epi32(s1_27_3, k__DCT_CONST_ROUNDING); |
michael@0 | 669 | const __m128i s1_28_4 = _mm_add_epi32(s1_28_2, k__DCT_CONST_ROUNDING); |
michael@0 | 670 | const __m128i s1_28_5 = _mm_add_epi32(s1_28_3, k__DCT_CONST_ROUNDING); |
michael@0 | 671 | const __m128i s1_29_4 = _mm_add_epi32(s1_29_2, k__DCT_CONST_ROUNDING); |
michael@0 | 672 | const __m128i s1_29_5 = _mm_add_epi32(s1_29_3, k__DCT_CONST_ROUNDING); |
michael@0 | 673 | const __m128i s1_18_6 = _mm_srai_epi32(s1_18_4, DCT_CONST_BITS); |
michael@0 | 674 | const __m128i s1_18_7 = _mm_srai_epi32(s1_18_5, DCT_CONST_BITS); |
michael@0 | 675 | const __m128i s1_19_6 = _mm_srai_epi32(s1_19_4, DCT_CONST_BITS); |
michael@0 | 676 | const __m128i s1_19_7 = _mm_srai_epi32(s1_19_5, DCT_CONST_BITS); |
michael@0 | 677 | const __m128i s1_20_6 = _mm_srai_epi32(s1_20_4, DCT_CONST_BITS); |
michael@0 | 678 | const __m128i s1_20_7 = _mm_srai_epi32(s1_20_5, DCT_CONST_BITS); |
michael@0 | 679 | const __m128i s1_21_6 = _mm_srai_epi32(s1_21_4, DCT_CONST_BITS); |
michael@0 | 680 | const __m128i s1_21_7 = _mm_srai_epi32(s1_21_5, DCT_CONST_BITS); |
michael@0 | 681 | const __m128i s1_26_6 = _mm_srai_epi32(s1_26_4, DCT_CONST_BITS); |
michael@0 | 682 | const __m128i s1_26_7 = _mm_srai_epi32(s1_26_5, DCT_CONST_BITS); |
michael@0 | 683 | const __m128i s1_27_6 = _mm_srai_epi32(s1_27_4, DCT_CONST_BITS); |
michael@0 | 684 | const __m128i s1_27_7 = _mm_srai_epi32(s1_27_5, DCT_CONST_BITS); |
michael@0 | 685 | const __m128i s1_28_6 = _mm_srai_epi32(s1_28_4, DCT_CONST_BITS); |
michael@0 | 686 | const __m128i s1_28_7 = _mm_srai_epi32(s1_28_5, DCT_CONST_BITS); |
michael@0 | 687 | const __m128i s1_29_6 = _mm_srai_epi32(s1_29_4, DCT_CONST_BITS); |
michael@0 | 688 | const __m128i s1_29_7 = _mm_srai_epi32(s1_29_5, DCT_CONST_BITS); |
michael@0 | 689 | // Combine |
michael@0 | 690 | step1[18] = _mm_packs_epi32(s1_18_6, s1_18_7); |
michael@0 | 691 | step1[19] = _mm_packs_epi32(s1_19_6, s1_19_7); |
michael@0 | 692 | step1[20] = _mm_packs_epi32(s1_20_6, s1_20_7); |
michael@0 | 693 | step1[21] = _mm_packs_epi32(s1_21_6, s1_21_7); |
michael@0 | 694 | step1[26] = _mm_packs_epi32(s1_26_6, s1_26_7); |
michael@0 | 695 | step1[27] = _mm_packs_epi32(s1_27_6, s1_27_7); |
michael@0 | 696 | step1[28] = _mm_packs_epi32(s1_28_6, s1_28_7); |
michael@0 | 697 | step1[29] = _mm_packs_epi32(s1_29_6, s1_29_7); |
michael@0 | 698 | } |
michael@0 | 699 | // Stage 5 |
michael@0 | 700 | { |
michael@0 | 701 | step2[4] = _mm_add_epi16(step1[5], step3[4]); |
michael@0 | 702 | step2[5] = _mm_sub_epi16(step3[4], step1[5]); |
michael@0 | 703 | step2[6] = _mm_sub_epi16(step3[7], step1[6]); |
michael@0 | 704 | step2[7] = _mm_add_epi16(step1[6], step3[7]); |
michael@0 | 705 | } |
michael@0 | 706 | { |
michael@0 | 707 | const __m128i out_00_0 = _mm_unpacklo_epi16(step1[0], step1[1]); |
michael@0 | 708 | const __m128i out_00_1 = _mm_unpackhi_epi16(step1[0], step1[1]); |
michael@0 | 709 | const __m128i out_08_0 = _mm_unpacklo_epi16(step1[2], step1[3]); |
michael@0 | 710 | const __m128i out_08_1 = _mm_unpackhi_epi16(step1[2], step1[3]); |
michael@0 | 711 | const __m128i out_00_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_p16); |
michael@0 | 712 | const __m128i out_00_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_p16); |
michael@0 | 713 | const __m128i out_16_2 = _mm_madd_epi16(out_00_0, k__cospi_p16_m16); |
michael@0 | 714 | const __m128i out_16_3 = _mm_madd_epi16(out_00_1, k__cospi_p16_m16); |
michael@0 | 715 | const __m128i out_08_2 = _mm_madd_epi16(out_08_0, k__cospi_p24_p08); |
michael@0 | 716 | const __m128i out_08_3 = _mm_madd_epi16(out_08_1, k__cospi_p24_p08); |
michael@0 | 717 | const __m128i out_24_2 = _mm_madd_epi16(out_08_0, k__cospi_m08_p24); |
michael@0 | 718 | const __m128i out_24_3 = _mm_madd_epi16(out_08_1, k__cospi_m08_p24); |
michael@0 | 719 | // dct_const_round_shift |
michael@0 | 720 | const __m128i out_00_4 = _mm_add_epi32(out_00_2, k__DCT_CONST_ROUNDING); |
michael@0 | 721 | const __m128i out_00_5 = _mm_add_epi32(out_00_3, k__DCT_CONST_ROUNDING); |
michael@0 | 722 | const __m128i out_16_4 = _mm_add_epi32(out_16_2, k__DCT_CONST_ROUNDING); |
michael@0 | 723 | const __m128i out_16_5 = _mm_add_epi32(out_16_3, k__DCT_CONST_ROUNDING); |
michael@0 | 724 | const __m128i out_08_4 = _mm_add_epi32(out_08_2, k__DCT_CONST_ROUNDING); |
michael@0 | 725 | const __m128i out_08_5 = _mm_add_epi32(out_08_3, k__DCT_CONST_ROUNDING); |
michael@0 | 726 | const __m128i out_24_4 = _mm_add_epi32(out_24_2, k__DCT_CONST_ROUNDING); |
michael@0 | 727 | const __m128i out_24_5 = _mm_add_epi32(out_24_3, k__DCT_CONST_ROUNDING); |
michael@0 | 728 | const __m128i out_00_6 = _mm_srai_epi32(out_00_4, DCT_CONST_BITS); |
michael@0 | 729 | const __m128i out_00_7 = _mm_srai_epi32(out_00_5, DCT_CONST_BITS); |
michael@0 | 730 | const __m128i out_16_6 = _mm_srai_epi32(out_16_4, DCT_CONST_BITS); |
michael@0 | 731 | const __m128i out_16_7 = _mm_srai_epi32(out_16_5, DCT_CONST_BITS); |
michael@0 | 732 | const __m128i out_08_6 = _mm_srai_epi32(out_08_4, DCT_CONST_BITS); |
michael@0 | 733 | const __m128i out_08_7 = _mm_srai_epi32(out_08_5, DCT_CONST_BITS); |
michael@0 | 734 | const __m128i out_24_6 = _mm_srai_epi32(out_24_4, DCT_CONST_BITS); |
michael@0 | 735 | const __m128i out_24_7 = _mm_srai_epi32(out_24_5, DCT_CONST_BITS); |
michael@0 | 736 | // Combine |
michael@0 | 737 | out[ 0] = _mm_packs_epi32(out_00_6, out_00_7); |
michael@0 | 738 | out[16] = _mm_packs_epi32(out_16_6, out_16_7); |
michael@0 | 739 | out[ 8] = _mm_packs_epi32(out_08_6, out_08_7); |
michael@0 | 740 | out[24] = _mm_packs_epi32(out_24_6, out_24_7); |
michael@0 | 741 | } |
michael@0 | 742 | { |
michael@0 | 743 | const __m128i s2_09_0 = _mm_unpacklo_epi16(step1[ 9], step1[14]); |
michael@0 | 744 | const __m128i s2_09_1 = _mm_unpackhi_epi16(step1[ 9], step1[14]); |
michael@0 | 745 | const __m128i s2_10_0 = _mm_unpacklo_epi16(step1[10], step1[13]); |
michael@0 | 746 | const __m128i s2_10_1 = _mm_unpackhi_epi16(step1[10], step1[13]); |
michael@0 | 747 | const __m128i s2_09_2 = _mm_madd_epi16(s2_09_0, k__cospi_m08_p24); |
michael@0 | 748 | const __m128i s2_09_3 = _mm_madd_epi16(s2_09_1, k__cospi_m08_p24); |
michael@0 | 749 | const __m128i s2_10_2 = _mm_madd_epi16(s2_10_0, k__cospi_m24_m08); |
michael@0 | 750 | const __m128i s2_10_3 = _mm_madd_epi16(s2_10_1, k__cospi_m24_m08); |
michael@0 | 751 | const __m128i s2_13_2 = _mm_madd_epi16(s2_10_0, k__cospi_m08_p24); |
michael@0 | 752 | const __m128i s2_13_3 = _mm_madd_epi16(s2_10_1, k__cospi_m08_p24); |
michael@0 | 753 | const __m128i s2_14_2 = _mm_madd_epi16(s2_09_0, k__cospi_p24_p08); |
michael@0 | 754 | const __m128i s2_14_3 = _mm_madd_epi16(s2_09_1, k__cospi_p24_p08); |
michael@0 | 755 | // dct_const_round_shift |
michael@0 | 756 | const __m128i s2_09_4 = _mm_add_epi32(s2_09_2, k__DCT_CONST_ROUNDING); |
michael@0 | 757 | const __m128i s2_09_5 = _mm_add_epi32(s2_09_3, k__DCT_CONST_ROUNDING); |
michael@0 | 758 | const __m128i s2_10_4 = _mm_add_epi32(s2_10_2, k__DCT_CONST_ROUNDING); |
michael@0 | 759 | const __m128i s2_10_5 = _mm_add_epi32(s2_10_3, k__DCT_CONST_ROUNDING); |
michael@0 | 760 | const __m128i s2_13_4 = _mm_add_epi32(s2_13_2, k__DCT_CONST_ROUNDING); |
michael@0 | 761 | const __m128i s2_13_5 = _mm_add_epi32(s2_13_3, k__DCT_CONST_ROUNDING); |
michael@0 | 762 | const __m128i s2_14_4 = _mm_add_epi32(s2_14_2, k__DCT_CONST_ROUNDING); |
michael@0 | 763 | const __m128i s2_14_5 = _mm_add_epi32(s2_14_3, k__DCT_CONST_ROUNDING); |
michael@0 | 764 | const __m128i s2_09_6 = _mm_srai_epi32(s2_09_4, DCT_CONST_BITS); |
michael@0 | 765 | const __m128i s2_09_7 = _mm_srai_epi32(s2_09_5, DCT_CONST_BITS); |
michael@0 | 766 | const __m128i s2_10_6 = _mm_srai_epi32(s2_10_4, DCT_CONST_BITS); |
michael@0 | 767 | const __m128i s2_10_7 = _mm_srai_epi32(s2_10_5, DCT_CONST_BITS); |
michael@0 | 768 | const __m128i s2_13_6 = _mm_srai_epi32(s2_13_4, DCT_CONST_BITS); |
michael@0 | 769 | const __m128i s2_13_7 = _mm_srai_epi32(s2_13_5, DCT_CONST_BITS); |
michael@0 | 770 | const __m128i s2_14_6 = _mm_srai_epi32(s2_14_4, DCT_CONST_BITS); |
michael@0 | 771 | const __m128i s2_14_7 = _mm_srai_epi32(s2_14_5, DCT_CONST_BITS); |
michael@0 | 772 | // Combine |
michael@0 | 773 | step2[ 9] = _mm_packs_epi32(s2_09_6, s2_09_7); |
michael@0 | 774 | step2[10] = _mm_packs_epi32(s2_10_6, s2_10_7); |
michael@0 | 775 | step2[13] = _mm_packs_epi32(s2_13_6, s2_13_7); |
michael@0 | 776 | step2[14] = _mm_packs_epi32(s2_14_6, s2_14_7); |
michael@0 | 777 | } |
michael@0 | 778 | { |
michael@0 | 779 | step2[16] = _mm_add_epi16(step1[19], step3[16]); |
michael@0 | 780 | step2[17] = _mm_add_epi16(step1[18], step3[17]); |
michael@0 | 781 | step2[18] = _mm_sub_epi16(step3[17], step1[18]); |
michael@0 | 782 | step2[19] = _mm_sub_epi16(step3[16], step1[19]); |
michael@0 | 783 | step2[20] = _mm_sub_epi16(step3[23], step1[20]); |
michael@0 | 784 | step2[21] = _mm_sub_epi16(step3[22], step1[21]); |
michael@0 | 785 | step2[22] = _mm_add_epi16(step1[21], step3[22]); |
michael@0 | 786 | step2[23] = _mm_add_epi16(step1[20], step3[23]); |
michael@0 | 787 | step2[24] = _mm_add_epi16(step1[27], step3[24]); |
michael@0 | 788 | step2[25] = _mm_add_epi16(step1[26], step3[25]); |
michael@0 | 789 | step2[26] = _mm_sub_epi16(step3[25], step1[26]); |
michael@0 | 790 | step2[27] = _mm_sub_epi16(step3[24], step1[27]); |
michael@0 | 791 | step2[28] = _mm_sub_epi16(step3[31], step1[28]); |
michael@0 | 792 | step2[29] = _mm_sub_epi16(step3[30], step1[29]); |
michael@0 | 793 | step2[30] = _mm_add_epi16(step1[29], step3[30]); |
michael@0 | 794 | step2[31] = _mm_add_epi16(step1[28], step3[31]); |
michael@0 | 795 | } |
michael@0 | 796 | // Stage 6 |
michael@0 | 797 | { |
michael@0 | 798 | const __m128i out_04_0 = _mm_unpacklo_epi16(step2[4], step2[7]); |
michael@0 | 799 | const __m128i out_04_1 = _mm_unpackhi_epi16(step2[4], step2[7]); |
michael@0 | 800 | const __m128i out_20_0 = _mm_unpacklo_epi16(step2[5], step2[6]); |
michael@0 | 801 | const __m128i out_20_1 = _mm_unpackhi_epi16(step2[5], step2[6]); |
michael@0 | 802 | const __m128i out_12_0 = _mm_unpacklo_epi16(step2[5], step2[6]); |
michael@0 | 803 | const __m128i out_12_1 = _mm_unpackhi_epi16(step2[5], step2[6]); |
michael@0 | 804 | const __m128i out_28_0 = _mm_unpacklo_epi16(step2[4], step2[7]); |
michael@0 | 805 | const __m128i out_28_1 = _mm_unpackhi_epi16(step2[4], step2[7]); |
michael@0 | 806 | const __m128i out_04_2 = _mm_madd_epi16(out_04_0, k__cospi_p28_p04); |
michael@0 | 807 | const __m128i out_04_3 = _mm_madd_epi16(out_04_1, k__cospi_p28_p04); |
michael@0 | 808 | const __m128i out_20_2 = _mm_madd_epi16(out_20_0, k__cospi_p12_p20); |
michael@0 | 809 | const __m128i out_20_3 = _mm_madd_epi16(out_20_1, k__cospi_p12_p20); |
michael@0 | 810 | const __m128i out_12_2 = _mm_madd_epi16(out_12_0, k__cospi_m20_p12); |
michael@0 | 811 | const __m128i out_12_3 = _mm_madd_epi16(out_12_1, k__cospi_m20_p12); |
michael@0 | 812 | const __m128i out_28_2 = _mm_madd_epi16(out_28_0, k__cospi_m04_p28); |
michael@0 | 813 | const __m128i out_28_3 = _mm_madd_epi16(out_28_1, k__cospi_m04_p28); |
michael@0 | 814 | // dct_const_round_shift |
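// Add the 1 << (DCT_CONST_BITS - 1) bias and arithmetic-shift right by
// DCT_CONST_BITS, the vector form of dct_const_round_shift().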
michael@0 | 815 | const __m128i out_04_4 = _mm_add_epi32(out_04_2, k__DCT_CONST_ROUNDING); |
michael@0 | 816 | const __m128i out_04_5 = _mm_add_epi32(out_04_3, k__DCT_CONST_ROUNDING); |
michael@0 | 817 | const __m128i out_20_4 = _mm_add_epi32(out_20_2, k__DCT_CONST_ROUNDING); |
michael@0 | 818 | const __m128i out_20_5 = _mm_add_epi32(out_20_3, k__DCT_CONST_ROUNDING); |
michael@0 | 819 | const __m128i out_12_4 = _mm_add_epi32(out_12_2, k__DCT_CONST_ROUNDING); |
michael@0 | 820 | const __m128i out_12_5 = _mm_add_epi32(out_12_3, k__DCT_CONST_ROUNDING); |
michael@0 | 821 | const __m128i out_28_4 = _mm_add_epi32(out_28_2, k__DCT_CONST_ROUNDING); |
michael@0 | 822 | const __m128i out_28_5 = _mm_add_epi32(out_28_3, k__DCT_CONST_ROUNDING); |
michael@0 | 823 | const __m128i out_04_6 = _mm_srai_epi32(out_04_4, DCT_CONST_BITS); |
michael@0 | 824 | const __m128i out_04_7 = _mm_srai_epi32(out_04_5, DCT_CONST_BITS); |
michael@0 | 825 | const __m128i out_20_6 = _mm_srai_epi32(out_20_4, DCT_CONST_BITS); |
michael@0 | 826 | const __m128i out_20_7 = _mm_srai_epi32(out_20_5, DCT_CONST_BITS); |
michael@0 | 827 | const __m128i out_12_6 = _mm_srai_epi32(out_12_4, DCT_CONST_BITS); |
michael@0 | 828 | const __m128i out_12_7 = _mm_srai_epi32(out_12_5, DCT_CONST_BITS); |
michael@0 | 829 | const __m128i out_28_6 = _mm_srai_epi32(out_28_4, DCT_CONST_BITS); |
michael@0 | 830 | const __m128i out_28_7 = _mm_srai_epi32(out_28_5, DCT_CONST_BITS); |
michael@0 | 831 | // Combine |
michael@0 | 832 | out[ 4] = _mm_packs_epi32(out_04_6, out_04_7); |
michael@0 | 833 | out[20] = _mm_packs_epi32(out_20_6, out_20_7); |
michael@0 | 834 | out[12] = _mm_packs_epi32(out_12_6, out_12_7); |
michael@0 | 835 | out[28] = _mm_packs_epi32(out_28_6, out_28_7); |
michael@0 | 836 | } |
michael@0 | 837 | { |
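// Butterflies combining step1[8..15] with the step2[9], step2[10],
// step2[13] and step2[14] rotations computed above.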
michael@0 | 838 | step3[ 8] = _mm_add_epi16(step2[ 9], step1[ 8]); |
michael@0 | 839 | step3[ 9] = _mm_sub_epi16(step1[ 8], step2[ 9]); |
michael@0 | 840 | step3[10] = _mm_sub_epi16(step1[11], step2[10]); |
michael@0 | 841 | step3[11] = _mm_add_epi16(step2[10], step1[11]); |
michael@0 | 842 | step3[12] = _mm_add_epi16(step2[13], step1[12]); |
michael@0 | 843 | step3[13] = _mm_sub_epi16(step1[12], step2[13]); |
michael@0 | 844 | step3[14] = _mm_sub_epi16(step1[15], step2[14]); |
michael@0 | 845 | step3[15] = _mm_add_epi16(step2[14], step1[15]); |
michael@0 | 846 | } |
michael@0 | 847 | { |
michael@0 | 848 | const __m128i s3_17_0 = _mm_unpacklo_epi16(step2[17], step2[30]); |
michael@0 | 849 | const __m128i s3_17_1 = _mm_unpackhi_epi16(step2[17], step2[30]); |
michael@0 | 850 | const __m128i s3_18_0 = _mm_unpacklo_epi16(step2[18], step2[29]); |
michael@0 | 851 | const __m128i s3_18_1 = _mm_unpackhi_epi16(step2[18], step2[29]); |
michael@0 | 852 | const __m128i s3_21_0 = _mm_unpacklo_epi16(step2[21], step2[26]); |
michael@0 | 853 | const __m128i s3_21_1 = _mm_unpackhi_epi16(step2[21], step2[26]); |
michael@0 | 854 | const __m128i s3_22_0 = _mm_unpacklo_epi16(step2[22], step2[25]); |
michael@0 | 855 | const __m128i s3_22_1 = _mm_unpackhi_epi16(step2[22], step2[25]); |
michael@0 | 856 | const __m128i s3_17_2 = _mm_madd_epi16(s3_17_0, k__cospi_m04_p28); |
michael@0 | 857 | const __m128i s3_17_3 = _mm_madd_epi16(s3_17_1, k__cospi_m04_p28); |
michael@0 | 858 | const __m128i s3_18_2 = _mm_madd_epi16(s3_18_0, k__cospi_m28_m04); |
michael@0 | 859 | const __m128i s3_18_3 = _mm_madd_epi16(s3_18_1, k__cospi_m28_m04); |
michael@0 | 860 | const __m128i s3_21_2 = _mm_madd_epi16(s3_21_0, k__cospi_m20_p12); |
michael@0 | 861 | const __m128i s3_21_3 = _mm_madd_epi16(s3_21_1, k__cospi_m20_p12); |
michael@0 | 862 | const __m128i s3_22_2 = _mm_madd_epi16(s3_22_0, k__cospi_m12_m20); |
michael@0 | 863 | const __m128i s3_22_3 = _mm_madd_epi16(s3_22_1, k__cospi_m12_m20); |
michael@0 | 864 | const __m128i s3_25_2 = _mm_madd_epi16(s3_22_0, k__cospi_m20_p12); |
michael@0 | 865 | const __m128i s3_25_3 = _mm_madd_epi16(s3_22_1, k__cospi_m20_p12); |
michael@0 | 866 | const __m128i s3_26_2 = _mm_madd_epi16(s3_21_0, k__cospi_p12_p20); |
michael@0 | 867 | const __m128i s3_26_3 = _mm_madd_epi16(s3_21_1, k__cospi_p12_p20); |
michael@0 | 868 | const __m128i s3_29_2 = _mm_madd_epi16(s3_18_0, k__cospi_m04_p28); |
michael@0 | 869 | const __m128i s3_29_3 = _mm_madd_epi16(s3_18_1, k__cospi_m04_p28); |
michael@0 | 870 | const __m128i s3_30_2 = _mm_madd_epi16(s3_17_0, k__cospi_p28_p04); |
michael@0 | 871 | const __m128i s3_30_3 = _mm_madd_epi16(s3_17_1, k__cospi_p28_p04); |
michael@0 | 872 | // dct_const_round_shift |
michael@0 | 873 | const __m128i s3_17_4 = _mm_add_epi32(s3_17_2, k__DCT_CONST_ROUNDING); |
michael@0 | 874 | const __m128i s3_17_5 = _mm_add_epi32(s3_17_3, k__DCT_CONST_ROUNDING); |
michael@0 | 875 | const __m128i s3_18_4 = _mm_add_epi32(s3_18_2, k__DCT_CONST_ROUNDING); |
michael@0 | 876 | const __m128i s3_18_5 = _mm_add_epi32(s3_18_3, k__DCT_CONST_ROUNDING); |
michael@0 | 877 | const __m128i s3_21_4 = _mm_add_epi32(s3_21_2, k__DCT_CONST_ROUNDING); |
michael@0 | 878 | const __m128i s3_21_5 = _mm_add_epi32(s3_21_3, k__DCT_CONST_ROUNDING); |
michael@0 | 879 | const __m128i s3_22_4 = _mm_add_epi32(s3_22_2, k__DCT_CONST_ROUNDING); |
michael@0 | 880 | const __m128i s3_22_5 = _mm_add_epi32(s3_22_3, k__DCT_CONST_ROUNDING); |
michael@0 | 881 | const __m128i s3_17_6 = _mm_srai_epi32(s3_17_4, DCT_CONST_BITS); |
michael@0 | 882 | const __m128i s3_17_7 = _mm_srai_epi32(s3_17_5, DCT_CONST_BITS); |
michael@0 | 883 | const __m128i s3_18_6 = _mm_srai_epi32(s3_18_4, DCT_CONST_BITS); |
michael@0 | 884 | const __m128i s3_18_7 = _mm_srai_epi32(s3_18_5, DCT_CONST_BITS); |
michael@0 | 885 | const __m128i s3_21_6 = _mm_srai_epi32(s3_21_4, DCT_CONST_BITS); |
michael@0 | 886 | const __m128i s3_21_7 = _mm_srai_epi32(s3_21_5, DCT_CONST_BITS); |
michael@0 | 887 | const __m128i s3_22_6 = _mm_srai_epi32(s3_22_4, DCT_CONST_BITS); |
michael@0 | 888 | const __m128i s3_22_7 = _mm_srai_epi32(s3_22_5, DCT_CONST_BITS); |
michael@0 | 889 | const __m128i s3_25_4 = _mm_add_epi32(s3_25_2, k__DCT_CONST_ROUNDING); |
michael@0 | 890 | const __m128i s3_25_5 = _mm_add_epi32(s3_25_3, k__DCT_CONST_ROUNDING); |
michael@0 | 891 | const __m128i s3_26_4 = _mm_add_epi32(s3_26_2, k__DCT_CONST_ROUNDING); |
michael@0 | 892 | const __m128i s3_26_5 = _mm_add_epi32(s3_26_3, k__DCT_CONST_ROUNDING); |
michael@0 | 893 | const __m128i s3_29_4 = _mm_add_epi32(s3_29_2, k__DCT_CONST_ROUNDING); |
michael@0 | 894 | const __m128i s3_29_5 = _mm_add_epi32(s3_29_3, k__DCT_CONST_ROUNDING); |
michael@0 | 895 | const __m128i s3_30_4 = _mm_add_epi32(s3_30_2, k__DCT_CONST_ROUNDING); |
michael@0 | 896 | const __m128i s3_30_5 = _mm_add_epi32(s3_30_3, k__DCT_CONST_ROUNDING); |
michael@0 | 897 | const __m128i s3_25_6 = _mm_srai_epi32(s3_25_4, DCT_CONST_BITS); |
michael@0 | 898 | const __m128i s3_25_7 = _mm_srai_epi32(s3_25_5, DCT_CONST_BITS); |
michael@0 | 899 | const __m128i s3_26_6 = _mm_srai_epi32(s3_26_4, DCT_CONST_BITS); |
michael@0 | 900 | const __m128i s3_26_7 = _mm_srai_epi32(s3_26_5, DCT_CONST_BITS); |
michael@0 | 901 | const __m128i s3_29_6 = _mm_srai_epi32(s3_29_4, DCT_CONST_BITS); |
michael@0 | 902 | const __m128i s3_29_7 = _mm_srai_epi32(s3_29_5, DCT_CONST_BITS); |
michael@0 | 903 | const __m128i s3_30_6 = _mm_srai_epi32(s3_30_4, DCT_CONST_BITS); |
michael@0 | 904 | const __m128i s3_30_7 = _mm_srai_epi32(s3_30_5, DCT_CONST_BITS); |
michael@0 | 905 | // Combine |
michael@0 | 906 | step3[17] = _mm_packs_epi32(s3_17_6, s3_17_7); |
michael@0 | 907 | step3[18] = _mm_packs_epi32(s3_18_6, s3_18_7); |
michael@0 | 908 | step3[21] = _mm_packs_epi32(s3_21_6, s3_21_7); |
michael@0 | 909 | step3[22] = _mm_packs_epi32(s3_22_6, s3_22_7); |
michael@0 | 910 | // Combine |
michael@0 | 911 | step3[25] = _mm_packs_epi32(s3_25_6, s3_25_7); |
michael@0 | 912 | step3[26] = _mm_packs_epi32(s3_26_6, s3_26_7); |
michael@0 | 913 | step3[29] = _mm_packs_epi32(s3_29_6, s3_29_7); |
michael@0 | 914 | step3[30] = _mm_packs_epi32(s3_30_6, s3_30_7); |
michael@0 | 915 | } |
michael@0 | 916 | // Stage 7 |
michael@0 | 917 | { |
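// Rotate step3[8..15] by the (cospi_30, cospi_2), (cospi_14, cospi_18),
// (cospi_22, cospi_10) and (cospi_6, cospi_26) pairs to produce out[2],
// out[6], out[10], out[14], out[18], out[22], out[26] and out[30].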
michael@0 | 918 | const __m128i out_02_0 = _mm_unpacklo_epi16(step3[ 8], step3[15]); |
michael@0 | 919 | const __m128i out_02_1 = _mm_unpackhi_epi16(step3[ 8], step3[15]); |
michael@0 | 920 | const __m128i out_18_0 = _mm_unpacklo_epi16(step3[ 9], step3[14]); |
michael@0 | 921 | const __m128i out_18_1 = _mm_unpackhi_epi16(step3[ 9], step3[14]); |
michael@0 | 922 | const __m128i out_10_0 = _mm_unpacklo_epi16(step3[10], step3[13]); |
michael@0 | 923 | const __m128i out_10_1 = _mm_unpackhi_epi16(step3[10], step3[13]); |
michael@0 | 924 | const __m128i out_26_0 = _mm_unpacklo_epi16(step3[11], step3[12]); |
michael@0 | 925 | const __m128i out_26_1 = _mm_unpackhi_epi16(step3[11], step3[12]); |
michael@0 | 926 | const __m128i out_02_2 = _mm_madd_epi16(out_02_0, k__cospi_p30_p02); |
michael@0 | 927 | const __m128i out_02_3 = _mm_madd_epi16(out_02_1, k__cospi_p30_p02); |
michael@0 | 928 | const __m128i out_18_2 = _mm_madd_epi16(out_18_0, k__cospi_p14_p18); |
michael@0 | 929 | const __m128i out_18_3 = _mm_madd_epi16(out_18_1, k__cospi_p14_p18); |
michael@0 | 930 | const __m128i out_10_2 = _mm_madd_epi16(out_10_0, k__cospi_p22_p10); |
michael@0 | 931 | const __m128i out_10_3 = _mm_madd_epi16(out_10_1, k__cospi_p22_p10); |
michael@0 | 932 | const __m128i out_26_2 = _mm_madd_epi16(out_26_0, k__cospi_p06_p26); |
michael@0 | 933 | const __m128i out_26_3 = _mm_madd_epi16(out_26_1, k__cospi_p06_p26); |
michael@0 | 934 | const __m128i out_06_2 = _mm_madd_epi16(out_26_0, k__cospi_m26_p06); |
michael@0 | 935 | const __m128i out_06_3 = _mm_madd_epi16(out_26_1, k__cospi_m26_p06); |
michael@0 | 936 | const __m128i out_22_2 = _mm_madd_epi16(out_10_0, k__cospi_m10_p22); |
michael@0 | 937 | const __m128i out_22_3 = _mm_madd_epi16(out_10_1, k__cospi_m10_p22); |
michael@0 | 938 | const __m128i out_14_2 = _mm_madd_epi16(out_18_0, k__cospi_m18_p14); |
michael@0 | 939 | const __m128i out_14_3 = _mm_madd_epi16(out_18_1, k__cospi_m18_p14); |
michael@0 | 940 | const __m128i out_30_2 = _mm_madd_epi16(out_02_0, k__cospi_m02_p30); |
michael@0 | 941 | const __m128i out_30_3 = _mm_madd_epi16(out_02_1, k__cospi_m02_p30); |
michael@0 | 942 | // dct_const_round_shift |
michael@0 | 943 | const __m128i out_02_4 = _mm_add_epi32(out_02_2, k__DCT_CONST_ROUNDING); |
michael@0 | 944 | const __m128i out_02_5 = _mm_add_epi32(out_02_3, k__DCT_CONST_ROUNDING); |
michael@0 | 945 | const __m128i out_18_4 = _mm_add_epi32(out_18_2, k__DCT_CONST_ROUNDING); |
michael@0 | 946 | const __m128i out_18_5 = _mm_add_epi32(out_18_3, k__DCT_CONST_ROUNDING); |
michael@0 | 947 | const __m128i out_10_4 = _mm_add_epi32(out_10_2, k__DCT_CONST_ROUNDING); |
michael@0 | 948 | const __m128i out_10_5 = _mm_add_epi32(out_10_3, k__DCT_CONST_ROUNDING); |
michael@0 | 949 | const __m128i out_26_4 = _mm_add_epi32(out_26_2, k__DCT_CONST_ROUNDING); |
michael@0 | 950 | const __m128i out_26_5 = _mm_add_epi32(out_26_3, k__DCT_CONST_ROUNDING); |
michael@0 | 951 | const __m128i out_06_4 = _mm_add_epi32(out_06_2, k__DCT_CONST_ROUNDING); |
michael@0 | 952 | const __m128i out_06_5 = _mm_add_epi32(out_06_3, k__DCT_CONST_ROUNDING); |
michael@0 | 953 | const __m128i out_22_4 = _mm_add_epi32(out_22_2, k__DCT_CONST_ROUNDING); |
michael@0 | 954 | const __m128i out_22_5 = _mm_add_epi32(out_22_3, k__DCT_CONST_ROUNDING); |
michael@0 | 955 | const __m128i out_14_4 = _mm_add_epi32(out_14_2, k__DCT_CONST_ROUNDING); |
michael@0 | 956 | const __m128i out_14_5 = _mm_add_epi32(out_14_3, k__DCT_CONST_ROUNDING); |
michael@0 | 957 | const __m128i out_30_4 = _mm_add_epi32(out_30_2, k__DCT_CONST_ROUNDING); |
michael@0 | 958 | const __m128i out_30_5 = _mm_add_epi32(out_30_3, k__DCT_CONST_ROUNDING); |
michael@0 | 959 | const __m128i out_02_6 = _mm_srai_epi32(out_02_4, DCT_CONST_BITS); |
michael@0 | 960 | const __m128i out_02_7 = _mm_srai_epi32(out_02_5, DCT_CONST_BITS); |
michael@0 | 961 | const __m128i out_18_6 = _mm_srai_epi32(out_18_4, DCT_CONST_BITS); |
michael@0 | 962 | const __m128i out_18_7 = _mm_srai_epi32(out_18_5, DCT_CONST_BITS); |
michael@0 | 963 | const __m128i out_10_6 = _mm_srai_epi32(out_10_4, DCT_CONST_BITS); |
michael@0 | 964 | const __m128i out_10_7 = _mm_srai_epi32(out_10_5, DCT_CONST_BITS); |
michael@0 | 965 | const __m128i out_26_6 = _mm_srai_epi32(out_26_4, DCT_CONST_BITS); |
michael@0 | 966 | const __m128i out_26_7 = _mm_srai_epi32(out_26_5, DCT_CONST_BITS); |
michael@0 | 967 | const __m128i out_06_6 = _mm_srai_epi32(out_06_4, DCT_CONST_BITS); |
michael@0 | 968 | const __m128i out_06_7 = _mm_srai_epi32(out_06_5, DCT_CONST_BITS); |
michael@0 | 969 | const __m128i out_22_6 = _mm_srai_epi32(out_22_4, DCT_CONST_BITS); |
michael@0 | 970 | const __m128i out_22_7 = _mm_srai_epi32(out_22_5, DCT_CONST_BITS); |
michael@0 | 971 | const __m128i out_14_6 = _mm_srai_epi32(out_14_4, DCT_CONST_BITS); |
michael@0 | 972 | const __m128i out_14_7 = _mm_srai_epi32(out_14_5, DCT_CONST_BITS); |
michael@0 | 973 | const __m128i out_30_6 = _mm_srai_epi32(out_30_4, DCT_CONST_BITS); |
michael@0 | 974 | const __m128i out_30_7 = _mm_srai_epi32(out_30_5, DCT_CONST_BITS); |
michael@0 | 975 | // Combine |
michael@0 | 976 | out[ 2] = _mm_packs_epi32(out_02_6, out_02_7); |
michael@0 | 977 | out[18] = _mm_packs_epi32(out_18_6, out_18_7); |
michael@0 | 978 | out[10] = _mm_packs_epi32(out_10_6, out_10_7); |
michael@0 | 979 | out[26] = _mm_packs_epi32(out_26_6, out_26_7); |
michael@0 | 980 | out[ 6] = _mm_packs_epi32(out_06_6, out_06_7); |
michael@0 | 981 | out[22] = _mm_packs_epi32(out_22_6, out_22_7); |
michael@0 | 982 | out[14] = _mm_packs_epi32(out_14_6, out_14_7); |
michael@0 | 983 | out[30] = _mm_packs_epi32(out_30_6, out_30_7); |
michael@0 | 984 | } |
michael@0 | 985 | { |
michael@0 | 986 | step1[16] = _mm_add_epi16(step3[17], step2[16]); |
michael@0 | 987 | step1[17] = _mm_sub_epi16(step2[16], step3[17]); |
michael@0 | 988 | step1[18] = _mm_sub_epi16(step2[19], step3[18]); |
michael@0 | 989 | step1[19] = _mm_add_epi16(step3[18], step2[19]); |
michael@0 | 990 | step1[20] = _mm_add_epi16(step3[21], step2[20]); |
michael@0 | 991 | step1[21] = _mm_sub_epi16(step2[20], step3[21]); |
michael@0 | 992 | step1[22] = _mm_sub_epi16(step2[23], step3[22]); |
michael@0 | 993 | step1[23] = _mm_add_epi16(step3[22], step2[23]); |
michael@0 | 994 | step1[24] = _mm_add_epi16(step3[25], step2[24]); |
michael@0 | 995 | step1[25] = _mm_sub_epi16(step2[24], step3[25]); |
michael@0 | 996 | step1[26] = _mm_sub_epi16(step2[27], step3[26]); |
michael@0 | 997 | step1[27] = _mm_add_epi16(step3[26], step2[27]); |
michael@0 | 998 | step1[28] = _mm_add_epi16(step3[29], step2[28]); |
michael@0 | 999 | step1[29] = _mm_sub_epi16(step2[28], step3[29]); |
michael@0 | 1000 | step1[30] = _mm_sub_epi16(step2[31], step3[30]); |
michael@0 | 1001 | step1[31] = _mm_add_epi16(step3[30], step2[31]); |
michael@0 | 1002 | } |
michael@0 | 1003 | // Final stage --- output indices are bit-reversed. |
michael@0 | 1004 | { |
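// step1[16..19] are paired with their mirrors step1[28..31] and rotated by
// the odd cospi pairs (31/1, 15/17, 23/9, 7/25) to yield out[1], out[7],
// out[9], out[15], out[17], out[23], out[25] and out[31].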
michael@0 | 1005 | const __m128i out_01_0 = _mm_unpacklo_epi16(step1[16], step1[31]); |
michael@0 | 1006 | const __m128i out_01_1 = _mm_unpackhi_epi16(step1[16], step1[31]); |
michael@0 | 1007 | const __m128i out_17_0 = _mm_unpacklo_epi16(step1[17], step1[30]); |
michael@0 | 1008 | const __m128i out_17_1 = _mm_unpackhi_epi16(step1[17], step1[30]); |
michael@0 | 1009 | const __m128i out_09_0 = _mm_unpacklo_epi16(step1[18], step1[29]); |
michael@0 | 1010 | const __m128i out_09_1 = _mm_unpackhi_epi16(step1[18], step1[29]); |
michael@0 | 1011 | const __m128i out_25_0 = _mm_unpacklo_epi16(step1[19], step1[28]); |
michael@0 | 1012 | const __m128i out_25_1 = _mm_unpackhi_epi16(step1[19], step1[28]); |
michael@0 | 1013 | const __m128i out_01_2 = _mm_madd_epi16(out_01_0, k__cospi_p31_p01); |
michael@0 | 1014 | const __m128i out_01_3 = _mm_madd_epi16(out_01_1, k__cospi_p31_p01); |
michael@0 | 1015 | const __m128i out_17_2 = _mm_madd_epi16(out_17_0, k__cospi_p15_p17); |
michael@0 | 1016 | const __m128i out_17_3 = _mm_madd_epi16(out_17_1, k__cospi_p15_p17); |
michael@0 | 1017 | const __m128i out_09_2 = _mm_madd_epi16(out_09_0, k__cospi_p23_p09); |
michael@0 | 1018 | const __m128i out_09_3 = _mm_madd_epi16(out_09_1, k__cospi_p23_p09); |
michael@0 | 1019 | const __m128i out_25_2 = _mm_madd_epi16(out_25_0, k__cospi_p07_p25); |
michael@0 | 1020 | const __m128i out_25_3 = _mm_madd_epi16(out_25_1, k__cospi_p07_p25); |
michael@0 | 1021 | const __m128i out_07_2 = _mm_madd_epi16(out_25_0, k__cospi_m25_p07); |
michael@0 | 1022 | const __m128i out_07_3 = _mm_madd_epi16(out_25_1, k__cospi_m25_p07); |
michael@0 | 1023 | const __m128i out_23_2 = _mm_madd_epi16(out_09_0, k__cospi_m09_p23); |
michael@0 | 1024 | const __m128i out_23_3 = _mm_madd_epi16(out_09_1, k__cospi_m09_p23); |
michael@0 | 1025 | const __m128i out_15_2 = _mm_madd_epi16(out_17_0, k__cospi_m17_p15); |
michael@0 | 1026 | const __m128i out_15_3 = _mm_madd_epi16(out_17_1, k__cospi_m17_p15); |
michael@0 | 1027 | const __m128i out_31_2 = _mm_madd_epi16(out_01_0, k__cospi_m01_p31); |
michael@0 | 1028 | const __m128i out_31_3 = _mm_madd_epi16(out_01_1, k__cospi_m01_p31); |
michael@0 | 1029 | // dct_const_round_shift |
michael@0 | 1030 | const __m128i out_01_4 = _mm_add_epi32(out_01_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1031 | const __m128i out_01_5 = _mm_add_epi32(out_01_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1032 | const __m128i out_17_4 = _mm_add_epi32(out_17_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1033 | const __m128i out_17_5 = _mm_add_epi32(out_17_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1034 | const __m128i out_09_4 = _mm_add_epi32(out_09_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1035 | const __m128i out_09_5 = _mm_add_epi32(out_09_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1036 | const __m128i out_25_4 = _mm_add_epi32(out_25_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1037 | const __m128i out_25_5 = _mm_add_epi32(out_25_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1038 | const __m128i out_07_4 = _mm_add_epi32(out_07_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1039 | const __m128i out_07_5 = _mm_add_epi32(out_07_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1040 | const __m128i out_23_4 = _mm_add_epi32(out_23_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1041 | const __m128i out_23_5 = _mm_add_epi32(out_23_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1042 | const __m128i out_15_4 = _mm_add_epi32(out_15_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1043 | const __m128i out_15_5 = _mm_add_epi32(out_15_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1044 | const __m128i out_31_4 = _mm_add_epi32(out_31_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1045 | const __m128i out_31_5 = _mm_add_epi32(out_31_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1046 | const __m128i out_01_6 = _mm_srai_epi32(out_01_4, DCT_CONST_BITS); |
michael@0 | 1047 | const __m128i out_01_7 = _mm_srai_epi32(out_01_5, DCT_CONST_BITS); |
michael@0 | 1048 | const __m128i out_17_6 = _mm_srai_epi32(out_17_4, DCT_CONST_BITS); |
michael@0 | 1049 | const __m128i out_17_7 = _mm_srai_epi32(out_17_5, DCT_CONST_BITS); |
michael@0 | 1050 | const __m128i out_09_6 = _mm_srai_epi32(out_09_4, DCT_CONST_BITS); |
michael@0 | 1051 | const __m128i out_09_7 = _mm_srai_epi32(out_09_5, DCT_CONST_BITS); |
michael@0 | 1052 | const __m128i out_25_6 = _mm_srai_epi32(out_25_4, DCT_CONST_BITS); |
michael@0 | 1053 | const __m128i out_25_7 = _mm_srai_epi32(out_25_5, DCT_CONST_BITS); |
michael@0 | 1054 | const __m128i out_07_6 = _mm_srai_epi32(out_07_4, DCT_CONST_BITS); |
michael@0 | 1055 | const __m128i out_07_7 = _mm_srai_epi32(out_07_5, DCT_CONST_BITS); |
michael@0 | 1056 | const __m128i out_23_6 = _mm_srai_epi32(out_23_4, DCT_CONST_BITS); |
michael@0 | 1057 | const __m128i out_23_7 = _mm_srai_epi32(out_23_5, DCT_CONST_BITS); |
michael@0 | 1058 | const __m128i out_15_6 = _mm_srai_epi32(out_15_4, DCT_CONST_BITS); |
michael@0 | 1059 | const __m128i out_15_7 = _mm_srai_epi32(out_15_5, DCT_CONST_BITS); |
michael@0 | 1060 | const __m128i out_31_6 = _mm_srai_epi32(out_31_4, DCT_CONST_BITS); |
michael@0 | 1061 | const __m128i out_31_7 = _mm_srai_epi32(out_31_5, DCT_CONST_BITS); |
michael@0 | 1062 | // Combine |
michael@0 | 1063 | out[ 1] = _mm_packs_epi32(out_01_6, out_01_7); |
michael@0 | 1064 | out[17] = _mm_packs_epi32(out_17_6, out_17_7); |
michael@0 | 1065 | out[ 9] = _mm_packs_epi32(out_09_6, out_09_7); |
michael@0 | 1066 | out[25] = _mm_packs_epi32(out_25_6, out_25_7); |
michael@0 | 1067 | out[ 7] = _mm_packs_epi32(out_07_6, out_07_7); |
michael@0 | 1068 | out[23] = _mm_packs_epi32(out_23_6, out_23_7); |
michael@0 | 1069 | out[15] = _mm_packs_epi32(out_15_6, out_15_7); |
michael@0 | 1070 | out[31] = _mm_packs_epi32(out_31_6, out_31_7); |
michael@0 | 1071 | } |
michael@0 | 1072 | { |
michael@0 | 1073 | const __m128i out_05_0 = _mm_unpacklo_epi16(step1[20], step1[27]); |
michael@0 | 1074 | const __m128i out_05_1 = _mm_unpackhi_epi16(step1[20], step1[27]); |
michael@0 | 1075 | const __m128i out_21_0 = _mm_unpacklo_epi16(step1[21], step1[26]); |
michael@0 | 1076 | const __m128i out_21_1 = _mm_unpackhi_epi16(step1[21], step1[26]); |
michael@0 | 1077 | const __m128i out_13_0 = _mm_unpacklo_epi16(step1[22], step1[25]); |
michael@0 | 1078 | const __m128i out_13_1 = _mm_unpackhi_epi16(step1[22], step1[25]); |
michael@0 | 1079 | const __m128i out_29_0 = _mm_unpacklo_epi16(step1[23], step1[24]); |
michael@0 | 1080 | const __m128i out_29_1 = _mm_unpackhi_epi16(step1[23], step1[24]); |
michael@0 | 1081 | const __m128i out_05_2 = _mm_madd_epi16(out_05_0, k__cospi_p27_p05); |
michael@0 | 1082 | const __m128i out_05_3 = _mm_madd_epi16(out_05_1, k__cospi_p27_p05); |
michael@0 | 1083 | const __m128i out_21_2 = _mm_madd_epi16(out_21_0, k__cospi_p11_p21); |
michael@0 | 1084 | const __m128i out_21_3 = _mm_madd_epi16(out_21_1, k__cospi_p11_p21); |
michael@0 | 1085 | const __m128i out_13_2 = _mm_madd_epi16(out_13_0, k__cospi_p19_p13); |
michael@0 | 1086 | const __m128i out_13_3 = _mm_madd_epi16(out_13_1, k__cospi_p19_p13); |
michael@0 | 1087 | const __m128i out_29_2 = _mm_madd_epi16(out_29_0, k__cospi_p03_p29); |
michael@0 | 1088 | const __m128i out_29_3 = _mm_madd_epi16(out_29_1, k__cospi_p03_p29); |
michael@0 | 1089 | const __m128i out_03_2 = _mm_madd_epi16(out_29_0, k__cospi_m29_p03); |
michael@0 | 1090 | const __m128i out_03_3 = _mm_madd_epi16(out_29_1, k__cospi_m29_p03); |
michael@0 | 1091 | const __m128i out_19_2 = _mm_madd_epi16(out_13_0, k__cospi_m13_p19); |
michael@0 | 1092 | const __m128i out_19_3 = _mm_madd_epi16(out_13_1, k__cospi_m13_p19); |
michael@0 | 1093 | const __m128i out_11_2 = _mm_madd_epi16(out_21_0, k__cospi_m21_p11); |
michael@0 | 1094 | const __m128i out_11_3 = _mm_madd_epi16(out_21_1, k__cospi_m21_p11); |
michael@0 | 1095 | const __m128i out_27_2 = _mm_madd_epi16(out_05_0, k__cospi_m05_p27); |
michael@0 | 1096 | const __m128i out_27_3 = _mm_madd_epi16(out_05_1, k__cospi_m05_p27); |
michael@0 | 1097 | // dct_const_round_shift |
michael@0 | 1098 | const __m128i out_05_4 = _mm_add_epi32(out_05_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1099 | const __m128i out_05_5 = _mm_add_epi32(out_05_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1100 | const __m128i out_21_4 = _mm_add_epi32(out_21_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1101 | const __m128i out_21_5 = _mm_add_epi32(out_21_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1102 | const __m128i out_13_4 = _mm_add_epi32(out_13_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1103 | const __m128i out_13_5 = _mm_add_epi32(out_13_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1104 | const __m128i out_29_4 = _mm_add_epi32(out_29_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1105 | const __m128i out_29_5 = _mm_add_epi32(out_29_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1106 | const __m128i out_03_4 = _mm_add_epi32(out_03_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1107 | const __m128i out_03_5 = _mm_add_epi32(out_03_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1108 | const __m128i out_19_4 = _mm_add_epi32(out_19_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1109 | const __m128i out_19_5 = _mm_add_epi32(out_19_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1110 | const __m128i out_11_4 = _mm_add_epi32(out_11_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1111 | const __m128i out_11_5 = _mm_add_epi32(out_11_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1112 | const __m128i out_27_4 = _mm_add_epi32(out_27_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1113 | const __m128i out_27_5 = _mm_add_epi32(out_27_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1114 | const __m128i out_05_6 = _mm_srai_epi32(out_05_4, DCT_CONST_BITS); |
michael@0 | 1115 | const __m128i out_05_7 = _mm_srai_epi32(out_05_5, DCT_CONST_BITS); |
michael@0 | 1116 | const __m128i out_21_6 = _mm_srai_epi32(out_21_4, DCT_CONST_BITS); |
michael@0 | 1117 | const __m128i out_21_7 = _mm_srai_epi32(out_21_5, DCT_CONST_BITS); |
michael@0 | 1118 | const __m128i out_13_6 = _mm_srai_epi32(out_13_4, DCT_CONST_BITS); |
michael@0 | 1119 | const __m128i out_13_7 = _mm_srai_epi32(out_13_5, DCT_CONST_BITS); |
michael@0 | 1120 | const __m128i out_29_6 = _mm_srai_epi32(out_29_4, DCT_CONST_BITS); |
michael@0 | 1121 | const __m128i out_29_7 = _mm_srai_epi32(out_29_5, DCT_CONST_BITS); |
michael@0 | 1122 | const __m128i out_03_6 = _mm_srai_epi32(out_03_4, DCT_CONST_BITS); |
michael@0 | 1123 | const __m128i out_03_7 = _mm_srai_epi32(out_03_5, DCT_CONST_BITS); |
michael@0 | 1124 | const __m128i out_19_6 = _mm_srai_epi32(out_19_4, DCT_CONST_BITS); |
michael@0 | 1125 | const __m128i out_19_7 = _mm_srai_epi32(out_19_5, DCT_CONST_BITS); |
michael@0 | 1126 | const __m128i out_11_6 = _mm_srai_epi32(out_11_4, DCT_CONST_BITS); |
michael@0 | 1127 | const __m128i out_11_7 = _mm_srai_epi32(out_11_5, DCT_CONST_BITS); |
michael@0 | 1128 | const __m128i out_27_6 = _mm_srai_epi32(out_27_4, DCT_CONST_BITS); |
michael@0 | 1129 | const __m128i out_27_7 = _mm_srai_epi32(out_27_5, DCT_CONST_BITS); |
michael@0 | 1130 | // Combine |
michael@0 | 1131 | out[ 5] = _mm_packs_epi32(out_05_6, out_05_7); |
michael@0 | 1132 | out[21] = _mm_packs_epi32(out_21_6, out_21_7); |
michael@0 | 1133 | out[13] = _mm_packs_epi32(out_13_6, out_13_7); |
michael@0 | 1134 | out[29] = _mm_packs_epi32(out_29_6, out_29_7); |
michael@0 | 1135 | out[ 3] = _mm_packs_epi32(out_03_6, out_03_7); |
michael@0 | 1136 | out[19] = _mm_packs_epi32(out_19_6, out_19_7); |
michael@0 | 1137 | out[11] = _mm_packs_epi32(out_11_6, out_11_7); |
michael@0 | 1138 | out[27] = _mm_packs_epi32(out_27_6, out_27_7); |
michael@0 | 1139 | } |
michael@0 | 1140 | #if FDCT32x32_HIGH_PRECISION |
michael@0 | 1141 | } else { |
michael@0 | 1142 | __m128i lstep1[64], lstep2[64], lstep3[64]; |
michael@0 | 1143 | __m128i u[32], v[32], sign[16]; |
michael@0 | 1144 | const __m128i K32One = _mm_set_epi32(1, 1, 1, 1); |
michael@0 | 1145 | // start using 32-bit operations |
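// From here on each 16-bit step vector is split into a low/high pair of
// 32-bit vectors (lstep*[2k], lstep*[2k + 1]) so the remaining stages can
// keep full 32-bit precision.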
michael@0 | 1146 | // stage 3 |
michael@0 | 1147 | { |
michael@0 | 1148 | // expanding to 32-bit values prior to the addition operations |
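// The unpack against kZero places each 16-bit value in the low half of a
// 32-bit lane; the _mm_madd_epi16 with kOne then multiplies the (value, 0)
// pair by (1, 1), re-reading the value as signed and leaving it
// sign-extended to 32 bits.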
michael@0 | 1149 | lstep2[ 0] = _mm_unpacklo_epi16(step2[ 0], kZero); |
michael@0 | 1150 | lstep2[ 1] = _mm_unpackhi_epi16(step2[ 0], kZero); |
michael@0 | 1151 | lstep2[ 2] = _mm_unpacklo_epi16(step2[ 1], kZero); |
michael@0 | 1152 | lstep2[ 3] = _mm_unpackhi_epi16(step2[ 1], kZero); |
michael@0 | 1153 | lstep2[ 4] = _mm_unpacklo_epi16(step2[ 2], kZero); |
michael@0 | 1154 | lstep2[ 5] = _mm_unpackhi_epi16(step2[ 2], kZero); |
michael@0 | 1155 | lstep2[ 6] = _mm_unpacklo_epi16(step2[ 3], kZero); |
michael@0 | 1156 | lstep2[ 7] = _mm_unpackhi_epi16(step2[ 3], kZero); |
michael@0 | 1157 | lstep2[ 8] = _mm_unpacklo_epi16(step2[ 4], kZero); |
michael@0 | 1158 | lstep2[ 9] = _mm_unpackhi_epi16(step2[ 4], kZero); |
michael@0 | 1159 | lstep2[10] = _mm_unpacklo_epi16(step2[ 5], kZero); |
michael@0 | 1160 | lstep2[11] = _mm_unpackhi_epi16(step2[ 5], kZero); |
michael@0 | 1161 | lstep2[12] = _mm_unpacklo_epi16(step2[ 6], kZero); |
michael@0 | 1162 | lstep2[13] = _mm_unpackhi_epi16(step2[ 6], kZero); |
michael@0 | 1163 | lstep2[14] = _mm_unpacklo_epi16(step2[ 7], kZero); |
michael@0 | 1164 | lstep2[15] = _mm_unpackhi_epi16(step2[ 7], kZero); |
michael@0 | 1165 | lstep2[ 0] = _mm_madd_epi16(lstep2[ 0], kOne); |
michael@0 | 1166 | lstep2[ 1] = _mm_madd_epi16(lstep2[ 1], kOne); |
michael@0 | 1167 | lstep2[ 2] = _mm_madd_epi16(lstep2[ 2], kOne); |
michael@0 | 1168 | lstep2[ 3] = _mm_madd_epi16(lstep2[ 3], kOne); |
michael@0 | 1169 | lstep2[ 4] = _mm_madd_epi16(lstep2[ 4], kOne); |
michael@0 | 1170 | lstep2[ 5] = _mm_madd_epi16(lstep2[ 5], kOne); |
michael@0 | 1171 | lstep2[ 6] = _mm_madd_epi16(lstep2[ 6], kOne); |
michael@0 | 1172 | lstep2[ 7] = _mm_madd_epi16(lstep2[ 7], kOne); |
michael@0 | 1173 | lstep2[ 8] = _mm_madd_epi16(lstep2[ 8], kOne); |
michael@0 | 1174 | lstep2[ 9] = _mm_madd_epi16(lstep2[ 9], kOne); |
michael@0 | 1175 | lstep2[10] = _mm_madd_epi16(lstep2[10], kOne); |
michael@0 | 1176 | lstep2[11] = _mm_madd_epi16(lstep2[11], kOne); |
michael@0 | 1177 | lstep2[12] = _mm_madd_epi16(lstep2[12], kOne); |
michael@0 | 1178 | lstep2[13] = _mm_madd_epi16(lstep2[13], kOne); |
michael@0 | 1179 | lstep2[14] = _mm_madd_epi16(lstep2[14], kOne); |
michael@0 | 1180 | lstep2[15] = _mm_madd_epi16(lstep2[15], kOne); |
michael@0 | 1181 | |
michael@0 | 1182 | lstep3[ 0] = _mm_add_epi32(lstep2[14], lstep2[ 0]); |
michael@0 | 1183 | lstep3[ 1] = _mm_add_epi32(lstep2[15], lstep2[ 1]); |
michael@0 | 1184 | lstep3[ 2] = _mm_add_epi32(lstep2[12], lstep2[ 2]); |
michael@0 | 1185 | lstep3[ 3] = _mm_add_epi32(lstep2[13], lstep2[ 3]); |
michael@0 | 1186 | lstep3[ 4] = _mm_add_epi32(lstep2[10], lstep2[ 4]); |
michael@0 | 1187 | lstep3[ 5] = _mm_add_epi32(lstep2[11], lstep2[ 5]); |
michael@0 | 1188 | lstep3[ 6] = _mm_add_epi32(lstep2[ 8], lstep2[ 6]); |
michael@0 | 1189 | lstep3[ 7] = _mm_add_epi32(lstep2[ 9], lstep2[ 7]); |
michael@0 | 1190 | lstep3[ 8] = _mm_sub_epi32(lstep2[ 6], lstep2[ 8]); |
michael@0 | 1191 | lstep3[ 9] = _mm_sub_epi32(lstep2[ 7], lstep2[ 9]); |
michael@0 | 1192 | lstep3[10] = _mm_sub_epi32(lstep2[ 4], lstep2[10]); |
michael@0 | 1193 | lstep3[11] = _mm_sub_epi32(lstep2[ 5], lstep2[11]); |
michael@0 | 1194 | lstep3[12] = _mm_sub_epi32(lstep2[ 2], lstep2[12]); |
michael@0 | 1195 | lstep3[13] = _mm_sub_epi32(lstep2[ 3], lstep2[13]); |
michael@0 | 1196 | lstep3[14] = _mm_sub_epi32(lstep2[ 0], lstep2[14]); |
michael@0 | 1197 | lstep3[15] = _mm_sub_epi32(lstep2[ 1], lstep2[15]); |
michael@0 | 1198 | } |
michael@0 | 1199 | { |
michael@0 | 1200 | const __m128i s3_10_0 = _mm_unpacklo_epi16(step2[13], step2[10]); |
michael@0 | 1201 | const __m128i s3_10_1 = _mm_unpackhi_epi16(step2[13], step2[10]); |
michael@0 | 1202 | const __m128i s3_11_0 = _mm_unpacklo_epi16(step2[12], step2[11]); |
michael@0 | 1203 | const __m128i s3_11_1 = _mm_unpackhi_epi16(step2[12], step2[11]); |
michael@0 | 1204 | const __m128i s3_10_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_m16); |
michael@0 | 1205 | const __m128i s3_10_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_m16); |
michael@0 | 1206 | const __m128i s3_11_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_m16); |
michael@0 | 1207 | const __m128i s3_11_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_m16); |
michael@0 | 1208 | const __m128i s3_12_2 = _mm_madd_epi16(s3_11_0, k__cospi_p16_p16); |
michael@0 | 1209 | const __m128i s3_12_3 = _mm_madd_epi16(s3_11_1, k__cospi_p16_p16); |
michael@0 | 1210 | const __m128i s3_13_2 = _mm_madd_epi16(s3_10_0, k__cospi_p16_p16); |
michael@0 | 1211 | const __m128i s3_13_3 = _mm_madd_epi16(s3_10_1, k__cospi_p16_p16); |
michael@0 | 1212 | // dct_const_round_shift |
michael@0 | 1213 | const __m128i s3_10_4 = _mm_add_epi32(s3_10_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1214 | const __m128i s3_10_5 = _mm_add_epi32(s3_10_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1215 | const __m128i s3_11_4 = _mm_add_epi32(s3_11_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1216 | const __m128i s3_11_5 = _mm_add_epi32(s3_11_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1217 | const __m128i s3_12_4 = _mm_add_epi32(s3_12_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1218 | const __m128i s3_12_5 = _mm_add_epi32(s3_12_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1219 | const __m128i s3_13_4 = _mm_add_epi32(s3_13_2, k__DCT_CONST_ROUNDING); |
michael@0 | 1220 | const __m128i s3_13_5 = _mm_add_epi32(s3_13_3, k__DCT_CONST_ROUNDING); |
michael@0 | 1221 | lstep3[20] = _mm_srai_epi32(s3_10_4, DCT_CONST_BITS); |
michael@0 | 1222 | lstep3[21] = _mm_srai_epi32(s3_10_5, DCT_CONST_BITS); |
michael@0 | 1223 | lstep3[22] = _mm_srai_epi32(s3_11_4, DCT_CONST_BITS); |
michael@0 | 1224 | lstep3[23] = _mm_srai_epi32(s3_11_5, DCT_CONST_BITS); |
michael@0 | 1225 | lstep3[24] = _mm_srai_epi32(s3_12_4, DCT_CONST_BITS); |
michael@0 | 1226 | lstep3[25] = _mm_srai_epi32(s3_12_5, DCT_CONST_BITS); |
michael@0 | 1227 | lstep3[26] = _mm_srai_epi32(s3_13_4, DCT_CONST_BITS); |
michael@0 | 1228 | lstep3[27] = _mm_srai_epi32(s3_13_5, DCT_CONST_BITS); |
michael@0 | 1229 | } |
michael@0 | 1230 | { |
michael@0 | 1231 | lstep2[40] = _mm_unpacklo_epi16(step2[20], kZero); |
michael@0 | 1232 | lstep2[41] = _mm_unpackhi_epi16(step2[20], kZero); |
michael@0 | 1233 | lstep2[42] = _mm_unpacklo_epi16(step2[21], kZero); |
michael@0 | 1234 | lstep2[43] = _mm_unpackhi_epi16(step2[21], kZero); |
michael@0 | 1235 | lstep2[44] = _mm_unpacklo_epi16(step2[22], kZero); |
michael@0 | 1236 | lstep2[45] = _mm_unpackhi_epi16(step2[22], kZero); |
michael@0 | 1237 | lstep2[46] = _mm_unpacklo_epi16(step2[23], kZero); |
michael@0 | 1238 | lstep2[47] = _mm_unpackhi_epi16(step2[23], kZero); |
michael@0 | 1239 | lstep2[48] = _mm_unpacklo_epi16(step2[24], kZero); |
michael@0 | 1240 | lstep2[49] = _mm_unpackhi_epi16(step2[24], kZero); |
michael@0 | 1241 | lstep2[50] = _mm_unpacklo_epi16(step2[25], kZero); |
michael@0 | 1242 | lstep2[51] = _mm_unpackhi_epi16(step2[25], kZero); |
michael@0 | 1243 | lstep2[52] = _mm_unpacklo_epi16(step2[26], kZero); |
michael@0 | 1244 | lstep2[53] = _mm_unpackhi_epi16(step2[26], kZero); |
michael@0 | 1245 | lstep2[54] = _mm_unpacklo_epi16(step2[27], kZero); |
michael@0 | 1246 | lstep2[55] = _mm_unpackhi_epi16(step2[27], kZero); |
michael@0 | 1247 | lstep2[40] = _mm_madd_epi16(lstep2[40], kOne); |
michael@0 | 1248 | lstep2[41] = _mm_madd_epi16(lstep2[41], kOne); |
michael@0 | 1249 | lstep2[42] = _mm_madd_epi16(lstep2[42], kOne); |
michael@0 | 1250 | lstep2[43] = _mm_madd_epi16(lstep2[43], kOne); |
michael@0 | 1251 | lstep2[44] = _mm_madd_epi16(lstep2[44], kOne); |
michael@0 | 1252 | lstep2[45] = _mm_madd_epi16(lstep2[45], kOne); |
michael@0 | 1253 | lstep2[46] = _mm_madd_epi16(lstep2[46], kOne); |
michael@0 | 1254 | lstep2[47] = _mm_madd_epi16(lstep2[47], kOne); |
michael@0 | 1255 | lstep2[48] = _mm_madd_epi16(lstep2[48], kOne); |
michael@0 | 1256 | lstep2[49] = _mm_madd_epi16(lstep2[49], kOne); |
michael@0 | 1257 | lstep2[50] = _mm_madd_epi16(lstep2[50], kOne); |
michael@0 | 1258 | lstep2[51] = _mm_madd_epi16(lstep2[51], kOne); |
michael@0 | 1259 | lstep2[52] = _mm_madd_epi16(lstep2[52], kOne); |
michael@0 | 1260 | lstep2[53] = _mm_madd_epi16(lstep2[53], kOne); |
michael@0 | 1261 | lstep2[54] = _mm_madd_epi16(lstep2[54], kOne); |
michael@0 | 1262 | lstep2[55] = _mm_madd_epi16(lstep2[55], kOne); |
michael@0 | 1263 | |
michael@0 | 1264 | lstep1[32] = _mm_unpacklo_epi16(step1[16], kZero); |
michael@0 | 1265 | lstep1[33] = _mm_unpackhi_epi16(step1[16], kZero); |
michael@0 | 1266 | lstep1[34] = _mm_unpacklo_epi16(step1[17], kZero); |
michael@0 | 1267 | lstep1[35] = _mm_unpackhi_epi16(step1[17], kZero); |
michael@0 | 1268 | lstep1[36] = _mm_unpacklo_epi16(step1[18], kZero); |
michael@0 | 1269 | lstep1[37] = _mm_unpackhi_epi16(step1[18], kZero); |
michael@0 | 1270 | lstep1[38] = _mm_unpacklo_epi16(step1[19], kZero); |
michael@0 | 1271 | lstep1[39] = _mm_unpackhi_epi16(step1[19], kZero); |
michael@0 | 1272 | lstep1[56] = _mm_unpacklo_epi16(step1[28], kZero); |
michael@0 | 1273 | lstep1[57] = _mm_unpackhi_epi16(step1[28], kZero); |
michael@0 | 1274 | lstep1[58] = _mm_unpacklo_epi16(step1[29], kZero); |
michael@0 | 1275 | lstep1[59] = _mm_unpackhi_epi16(step1[29], kZero); |
michael@0 | 1276 | lstep1[60] = _mm_unpacklo_epi16(step1[30], kZero); |
michael@0 | 1277 | lstep1[61] = _mm_unpackhi_epi16(step1[30], kZero); |
michael@0 | 1278 | lstep1[62] = _mm_unpacklo_epi16(step1[31], kZero); |
michael@0 | 1279 | lstep1[63] = _mm_unpackhi_epi16(step1[31], kZero); |
michael@0 | 1280 | lstep1[32] = _mm_madd_epi16(lstep1[32], kOne); |
michael@0 | 1281 | lstep1[33] = _mm_madd_epi16(lstep1[33], kOne); |
michael@0 | 1282 | lstep1[34] = _mm_madd_epi16(lstep1[34], kOne); |
michael@0 | 1283 | lstep1[35] = _mm_madd_epi16(lstep1[35], kOne); |
michael@0 | 1284 | lstep1[36] = _mm_madd_epi16(lstep1[36], kOne); |
michael@0 | 1285 | lstep1[37] = _mm_madd_epi16(lstep1[37], kOne); |
michael@0 | 1286 | lstep1[38] = _mm_madd_epi16(lstep1[38], kOne); |
michael@0 | 1287 | lstep1[39] = _mm_madd_epi16(lstep1[39], kOne); |
michael@0 | 1288 | lstep1[56] = _mm_madd_epi16(lstep1[56], kOne); |
michael@0 | 1289 | lstep1[57] = _mm_madd_epi16(lstep1[57], kOne); |
michael@0 | 1290 | lstep1[58] = _mm_madd_epi16(lstep1[58], kOne); |
michael@0 | 1291 | lstep1[59] = _mm_madd_epi16(lstep1[59], kOne); |
michael@0 | 1292 | lstep1[60] = _mm_madd_epi16(lstep1[60], kOne); |
michael@0 | 1293 | lstep1[61] = _mm_madd_epi16(lstep1[61], kOne); |
michael@0 | 1294 | lstep1[62] = _mm_madd_epi16(lstep1[62], kOne); |
michael@0 | 1295 | lstep1[63] = _mm_madd_epi16(lstep1[63], kOne); |
michael@0 | 1296 | |
michael@0 | 1297 | lstep3[32] = _mm_add_epi32(lstep2[46], lstep1[32]); |
michael@0 | 1298 | lstep3[33] = _mm_add_epi32(lstep2[47], lstep1[33]); |
michael@0 | 1299 | |
michael@0 | 1300 | lstep3[34] = _mm_add_epi32(lstep2[44], lstep1[34]); |
michael@0 | 1301 | lstep3[35] = _mm_add_epi32(lstep2[45], lstep1[35]); |
michael@0 | 1302 | lstep3[36] = _mm_add_epi32(lstep2[42], lstep1[36]); |
michael@0 | 1303 | lstep3[37] = _mm_add_epi32(lstep2[43], lstep1[37]); |
michael@0 | 1304 | lstep3[38] = _mm_add_epi32(lstep2[40], lstep1[38]); |
michael@0 | 1305 | lstep3[39] = _mm_add_epi32(lstep2[41], lstep1[39]); |
michael@0 | 1306 | lstep3[40] = _mm_sub_epi32(lstep1[38], lstep2[40]); |
michael@0 | 1307 | lstep3[41] = _mm_sub_epi32(lstep1[39], lstep2[41]); |
michael@0 | 1308 | lstep3[42] = _mm_sub_epi32(lstep1[36], lstep2[42]); |
michael@0 | 1309 | lstep3[43] = _mm_sub_epi32(lstep1[37], lstep2[43]); |
michael@0 | 1310 | lstep3[44] = _mm_sub_epi32(lstep1[34], lstep2[44]); |
michael@0 | 1311 | lstep3[45] = _mm_sub_epi32(lstep1[35], lstep2[45]); |
michael@0 | 1312 | lstep3[46] = _mm_sub_epi32(lstep1[32], lstep2[46]); |
michael@0 | 1313 | lstep3[47] = _mm_sub_epi32(lstep1[33], lstep2[47]); |
michael@0 | 1314 | lstep3[48] = _mm_sub_epi32(lstep1[62], lstep2[48]); |
michael@0 | 1315 | lstep3[49] = _mm_sub_epi32(lstep1[63], lstep2[49]); |
michael@0 | 1316 | lstep3[50] = _mm_sub_epi32(lstep1[60], lstep2[50]); |
michael@0 | 1317 | lstep3[51] = _mm_sub_epi32(lstep1[61], lstep2[51]); |
michael@0 | 1318 | lstep3[52] = _mm_sub_epi32(lstep1[58], lstep2[52]); |
michael@0 | 1319 | lstep3[53] = _mm_sub_epi32(lstep1[59], lstep2[53]); |
michael@0 | 1320 | lstep3[54] = _mm_sub_epi32(lstep1[56], lstep2[54]); |
michael@0 | 1321 | lstep3[55] = _mm_sub_epi32(lstep1[57], lstep2[55]); |
michael@0 | 1322 | lstep3[56] = _mm_add_epi32(lstep2[54], lstep1[56]); |
michael@0 | 1323 | lstep3[57] = _mm_add_epi32(lstep2[55], lstep1[57]); |
michael@0 | 1324 | lstep3[58] = _mm_add_epi32(lstep2[52], lstep1[58]); |
michael@0 | 1325 | lstep3[59] = _mm_add_epi32(lstep2[53], lstep1[59]); |
michael@0 | 1326 | lstep3[60] = _mm_add_epi32(lstep2[50], lstep1[60]); |
michael@0 | 1327 | lstep3[61] = _mm_add_epi32(lstep2[51], lstep1[61]); |
michael@0 | 1328 | lstep3[62] = _mm_add_epi32(lstep2[48], lstep1[62]); |
michael@0 | 1329 | lstep3[63] = _mm_add_epi32(lstep2[49], lstep1[63]); |
michael@0 | 1330 | } |
michael@0 | 1331 | |
michael@0 | 1332 | // stage 4 |
michael@0 | 1333 | { |
michael@0 | 1334 | // expanding to 32-bit values prior to the addition operations |
michael@0 | 1335 | lstep2[16] = _mm_unpacklo_epi16(step2[ 8], kZero); |
michael@0 | 1336 | lstep2[17] = _mm_unpackhi_epi16(step2[ 8], kZero); |
michael@0 | 1337 | lstep2[18] = _mm_unpacklo_epi16(step2[ 9], kZero); |
michael@0 | 1338 | lstep2[19] = _mm_unpackhi_epi16(step2[ 9], kZero); |
michael@0 | 1339 | lstep2[28] = _mm_unpacklo_epi16(step2[14], kZero); |
michael@0 | 1340 | lstep2[29] = _mm_unpackhi_epi16(step2[14], kZero); |
michael@0 | 1341 | lstep2[30] = _mm_unpacklo_epi16(step2[15], kZero); |
michael@0 | 1342 | lstep2[31] = _mm_unpackhi_epi16(step2[15], kZero); |
michael@0 | 1343 | lstep2[16] = _mm_madd_epi16(lstep2[16], kOne); |
michael@0 | 1344 | lstep2[17] = _mm_madd_epi16(lstep2[17], kOne); |
michael@0 | 1345 | lstep2[18] = _mm_madd_epi16(lstep2[18], kOne); |
michael@0 | 1346 | lstep2[19] = _mm_madd_epi16(lstep2[19], kOne); |
michael@0 | 1347 | lstep2[28] = _mm_madd_epi16(lstep2[28], kOne); |
michael@0 | 1348 | lstep2[29] = _mm_madd_epi16(lstep2[29], kOne); |
michael@0 | 1349 | lstep2[30] = _mm_madd_epi16(lstep2[30], kOne); |
michael@0 | 1350 | lstep2[31] = _mm_madd_epi16(lstep2[31], kOne); |
michael@0 | 1351 | |
michael@0 | 1352 | lstep1[ 0] = _mm_add_epi32(lstep3[ 6], lstep3[ 0]); |
michael@0 | 1353 | lstep1[ 1] = _mm_add_epi32(lstep3[ 7], lstep3[ 1]); |
michael@0 | 1354 | lstep1[ 2] = _mm_add_epi32(lstep3[ 4], lstep3[ 2]); |
michael@0 | 1355 | lstep1[ 3] = _mm_add_epi32(lstep3[ 5], lstep3[ 3]); |
michael@0 | 1356 | lstep1[ 4] = _mm_sub_epi32(lstep3[ 2], lstep3[ 4]); |
michael@0 | 1357 | lstep1[ 5] = _mm_sub_epi32(lstep3[ 3], lstep3[ 5]); |
michael@0 | 1358 | lstep1[ 6] = _mm_sub_epi32(lstep3[ 0], lstep3[ 6]); |
michael@0 | 1359 | lstep1[ 7] = _mm_sub_epi32(lstep3[ 1], lstep3[ 7]); |
michael@0 | 1360 | lstep1[16] = _mm_add_epi32(lstep3[22], lstep2[16]); |
michael@0 | 1361 | lstep1[17] = _mm_add_epi32(lstep3[23], lstep2[17]); |
michael@0 | 1362 | lstep1[18] = _mm_add_epi32(lstep3[20], lstep2[18]); |
michael@0 | 1363 | lstep1[19] = _mm_add_epi32(lstep3[21], lstep2[19]); |
michael@0 | 1364 | lstep1[20] = _mm_sub_epi32(lstep2[18], lstep3[20]); |
michael@0 | 1365 | lstep1[21] = _mm_sub_epi32(lstep2[19], lstep3[21]); |
michael@0 | 1366 | lstep1[22] = _mm_sub_epi32(lstep2[16], lstep3[22]); |
michael@0 | 1367 | lstep1[23] = _mm_sub_epi32(lstep2[17], lstep3[23]); |
michael@0 | 1368 | lstep1[24] = _mm_sub_epi32(lstep2[30], lstep3[24]); |
michael@0 | 1369 | lstep1[25] = _mm_sub_epi32(lstep2[31], lstep3[25]); |
michael@0 | 1370 | lstep1[26] = _mm_sub_epi32(lstep2[28], lstep3[26]); |
michael@0 | 1371 | lstep1[27] = _mm_sub_epi32(lstep2[29], lstep3[27]); |
michael@0 | 1372 | lstep1[28] = _mm_add_epi32(lstep3[26], lstep2[28]); |
michael@0 | 1373 | lstep1[29] = _mm_add_epi32(lstep3[27], lstep2[29]); |
michael@0 | 1374 | lstep1[30] = _mm_add_epi32(lstep3[24], lstep2[30]); |
michael@0 | 1375 | lstep1[31] = _mm_add_epi32(lstep3[25], lstep2[31]); |
michael@0 | 1376 | } |
michael@0 | 1377 | { |
michael@0 | 1378 | // Rotate lstep3[10..13] by cospi_16_64 (32-bit sum/difference butterfly) |
michael@0 | 1379 | // to form lstep1[10..13]. |
michael@0 | 1380 | const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); |
michael@0 | 1381 | const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); |
michael@0 | 1382 | |
michael@0 | 1383 | u[0] = _mm_unpacklo_epi32(lstep3[12], lstep3[10]); |
michael@0 | 1384 | u[1] = _mm_unpackhi_epi32(lstep3[12], lstep3[10]); |
michael@0 | 1385 | u[2] = _mm_unpacklo_epi32(lstep3[13], lstep3[11]); |
michael@0 | 1386 | u[3] = _mm_unpackhi_epi32(lstep3[13], lstep3[11]); |
michael@0 | 1387 | |
michael@0 | 1388 | // TODO(jingning): manually inline k_madd_epi32() to further hide |
michael@0 | 1389 | // instruction latency. |
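// k_madd_epi32() multiplies the interleaved 32-bit lanes by the matching
// constant pair and sums each pair of products into a 64-bit lane;
// k_packs_epi64() keeps the low 32 bits of every sum, giving a 32-bit
// analogue of _mm_madd_epi16.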
michael@0 | 1390 | v[ 0] = k_madd_epi32(u[0], k32_p16_m16); |
michael@0 | 1391 | v[ 1] = k_madd_epi32(u[1], k32_p16_m16); |
michael@0 | 1392 | v[ 2] = k_madd_epi32(u[2], k32_p16_m16); |
michael@0 | 1393 | v[ 3] = k_madd_epi32(u[3], k32_p16_m16); |
michael@0 | 1394 | v[ 4] = k_madd_epi32(u[0], k32_p16_p16); |
michael@0 | 1395 | v[ 5] = k_madd_epi32(u[1], k32_p16_p16); |
michael@0 | 1396 | v[ 6] = k_madd_epi32(u[2], k32_p16_p16); |
michael@0 | 1397 | v[ 7] = k_madd_epi32(u[3], k32_p16_p16); |
michael@0 | 1398 | |
michael@0 | 1399 | u[0] = k_packs_epi64(v[0], v[1]); |
michael@0 | 1400 | u[1] = k_packs_epi64(v[2], v[3]); |
michael@0 | 1401 | u[2] = k_packs_epi64(v[4], v[5]); |
michael@0 | 1402 | u[3] = k_packs_epi64(v[6], v[7]); |
michael@0 | 1403 | |
michael@0 | 1404 | v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1405 | v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1406 | v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1407 | v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1408 | |
michael@0 | 1409 | lstep1[10] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
michael@0 | 1410 | lstep1[11] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
michael@0 | 1411 | lstep1[12] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
michael@0 | 1412 | lstep1[13] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
michael@0 | 1413 | } |
michael@0 | 1414 | { |
michael@0 | 1415 | const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); |
michael@0 | 1416 | const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); |
michael@0 | 1417 | const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); |
michael@0 | 1418 | |
michael@0 | 1419 | u[ 0] = _mm_unpacklo_epi32(lstep3[36], lstep3[58]); |
michael@0 | 1420 | u[ 1] = _mm_unpackhi_epi32(lstep3[36], lstep3[58]); |
michael@0 | 1421 | u[ 2] = _mm_unpacklo_epi32(lstep3[37], lstep3[59]); |
michael@0 | 1422 | u[ 3] = _mm_unpackhi_epi32(lstep3[37], lstep3[59]); |
michael@0 | 1423 | u[ 4] = _mm_unpacklo_epi32(lstep3[38], lstep3[56]); |
michael@0 | 1424 | u[ 5] = _mm_unpackhi_epi32(lstep3[38], lstep3[56]); |
michael@0 | 1425 | u[ 6] = _mm_unpacklo_epi32(lstep3[39], lstep3[57]); |
michael@0 | 1426 | u[ 7] = _mm_unpackhi_epi32(lstep3[39], lstep3[57]); |
michael@0 | 1427 | u[ 8] = _mm_unpacklo_epi32(lstep3[40], lstep3[54]); |
michael@0 | 1428 | u[ 9] = _mm_unpackhi_epi32(lstep3[40], lstep3[54]); |
michael@0 | 1429 | u[10] = _mm_unpacklo_epi32(lstep3[41], lstep3[55]); |
michael@0 | 1430 | u[11] = _mm_unpackhi_epi32(lstep3[41], lstep3[55]); |
michael@0 | 1431 | u[12] = _mm_unpacklo_epi32(lstep3[42], lstep3[52]); |
michael@0 | 1432 | u[13] = _mm_unpackhi_epi32(lstep3[42], lstep3[52]); |
michael@0 | 1433 | u[14] = _mm_unpacklo_epi32(lstep3[43], lstep3[53]); |
michael@0 | 1434 | u[15] = _mm_unpackhi_epi32(lstep3[43], lstep3[53]); |
michael@0 | 1435 | |
michael@0 | 1436 | v[ 0] = k_madd_epi32(u[ 0], k32_m08_p24); |
michael@0 | 1437 | v[ 1] = k_madd_epi32(u[ 1], k32_m08_p24); |
michael@0 | 1438 | v[ 2] = k_madd_epi32(u[ 2], k32_m08_p24); |
michael@0 | 1439 | v[ 3] = k_madd_epi32(u[ 3], k32_m08_p24); |
michael@0 | 1440 | v[ 4] = k_madd_epi32(u[ 4], k32_m08_p24); |
michael@0 | 1441 | v[ 5] = k_madd_epi32(u[ 5], k32_m08_p24); |
michael@0 | 1442 | v[ 6] = k_madd_epi32(u[ 6], k32_m08_p24); |
michael@0 | 1443 | v[ 7] = k_madd_epi32(u[ 7], k32_m08_p24); |
michael@0 | 1444 | v[ 8] = k_madd_epi32(u[ 8], k32_m24_m08); |
michael@0 | 1445 | v[ 9] = k_madd_epi32(u[ 9], k32_m24_m08); |
michael@0 | 1446 | v[10] = k_madd_epi32(u[10], k32_m24_m08); |
michael@0 | 1447 | v[11] = k_madd_epi32(u[11], k32_m24_m08); |
michael@0 | 1448 | v[12] = k_madd_epi32(u[12], k32_m24_m08); |
michael@0 | 1449 | v[13] = k_madd_epi32(u[13], k32_m24_m08); |
michael@0 | 1450 | v[14] = k_madd_epi32(u[14], k32_m24_m08); |
michael@0 | 1451 | v[15] = k_madd_epi32(u[15], k32_m24_m08); |
michael@0 | 1452 | v[16] = k_madd_epi32(u[12], k32_m08_p24); |
michael@0 | 1453 | v[17] = k_madd_epi32(u[13], k32_m08_p24); |
michael@0 | 1454 | v[18] = k_madd_epi32(u[14], k32_m08_p24); |
michael@0 | 1455 | v[19] = k_madd_epi32(u[15], k32_m08_p24); |
michael@0 | 1456 | v[20] = k_madd_epi32(u[ 8], k32_m08_p24); |
michael@0 | 1457 | v[21] = k_madd_epi32(u[ 9], k32_m08_p24); |
michael@0 | 1458 | v[22] = k_madd_epi32(u[10], k32_m08_p24); |
michael@0 | 1459 | v[23] = k_madd_epi32(u[11], k32_m08_p24); |
michael@0 | 1460 | v[24] = k_madd_epi32(u[ 4], k32_p24_p08); |
michael@0 | 1461 | v[25] = k_madd_epi32(u[ 5], k32_p24_p08); |
michael@0 | 1462 | v[26] = k_madd_epi32(u[ 6], k32_p24_p08); |
michael@0 | 1463 | v[27] = k_madd_epi32(u[ 7], k32_p24_p08); |
michael@0 | 1464 | v[28] = k_madd_epi32(u[ 0], k32_p24_p08); |
michael@0 | 1465 | v[29] = k_madd_epi32(u[ 1], k32_p24_p08); |
michael@0 | 1466 | v[30] = k_madd_epi32(u[ 2], k32_p24_p08); |
michael@0 | 1467 | v[31] = k_madd_epi32(u[ 3], k32_p24_p08); |
michael@0 | 1468 | |
michael@0 | 1469 | u[ 0] = k_packs_epi64(v[ 0], v[ 1]); |
michael@0 | 1470 | u[ 1] = k_packs_epi64(v[ 2], v[ 3]); |
michael@0 | 1471 | u[ 2] = k_packs_epi64(v[ 4], v[ 5]); |
michael@0 | 1472 | u[ 3] = k_packs_epi64(v[ 6], v[ 7]); |
michael@0 | 1473 | u[ 4] = k_packs_epi64(v[ 8], v[ 9]); |
michael@0 | 1474 | u[ 5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 1475 | u[ 6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 1476 | u[ 7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 1477 | u[ 8] = k_packs_epi64(v[16], v[17]); |
michael@0 | 1478 | u[ 9] = k_packs_epi64(v[18], v[19]); |
michael@0 | 1479 | u[10] = k_packs_epi64(v[20], v[21]); |
michael@0 | 1480 | u[11] = k_packs_epi64(v[22], v[23]); |
michael@0 | 1481 | u[12] = k_packs_epi64(v[24], v[25]); |
michael@0 | 1482 | u[13] = k_packs_epi64(v[26], v[27]); |
michael@0 | 1483 | u[14] = k_packs_epi64(v[28], v[29]); |
michael@0 | 1484 | u[15] = k_packs_epi64(v[30], v[31]); |
michael@0 | 1485 | |
michael@0 | 1486 | v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); |
michael@0 | 1487 | v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); |
michael@0 | 1488 | v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); |
michael@0 | 1489 | v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); |
michael@0 | 1490 | v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); |
michael@0 | 1491 | v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); |
michael@0 | 1492 | v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); |
michael@0 | 1493 | v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); |
michael@0 | 1494 | v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); |
michael@0 | 1495 | v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); |
michael@0 | 1496 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 1497 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 1498 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 1499 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 1500 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 1501 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 1502 | |
michael@0 | 1503 | lstep1[36] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); |
michael@0 | 1504 | lstep1[37] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); |
michael@0 | 1505 | lstep1[38] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); |
michael@0 | 1506 | lstep1[39] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); |
michael@0 | 1507 | lstep1[40] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); |
michael@0 | 1508 | lstep1[41] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); |
michael@0 | 1509 | lstep1[42] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); |
michael@0 | 1510 | lstep1[43] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); |
michael@0 | 1511 | lstep1[52] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); |
michael@0 | 1512 | lstep1[53] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); |
michael@0 | 1513 | lstep1[54] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 1514 | lstep1[55] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 1515 | lstep1[56] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 1516 | lstep1[57] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 1517 | lstep1[58] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 1518 | lstep1[59] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 1519 | } |
michael@0 | 1520 | // stage 5 |
michael@0 | 1521 | { |
michael@0 | 1522 | lstep2[ 8] = _mm_add_epi32(lstep1[10], lstep3[ 8]); |
michael@0 | 1523 | lstep2[ 9] = _mm_add_epi32(lstep1[11], lstep3[ 9]); |
michael@0 | 1524 | lstep2[10] = _mm_sub_epi32(lstep3[ 8], lstep1[10]); |
michael@0 | 1525 | lstep2[11] = _mm_sub_epi32(lstep3[ 9], lstep1[11]); |
michael@0 | 1526 | lstep2[12] = _mm_sub_epi32(lstep3[14], lstep1[12]); |
michael@0 | 1527 | lstep2[13] = _mm_sub_epi32(lstep3[15], lstep1[13]); |
michael@0 | 1528 | lstep2[14] = _mm_add_epi32(lstep1[12], lstep3[14]); |
michael@0 | 1529 | lstep2[15] = _mm_add_epi32(lstep1[13], lstep3[15]); |
michael@0 | 1530 | } |
michael@0 | 1531 | { |
michael@0 | 1532 | const __m128i k32_p16_p16 = pair_set_epi32(cospi_16_64, cospi_16_64); |
michael@0 | 1533 | const __m128i k32_p16_m16 = pair_set_epi32(cospi_16_64, -cospi_16_64); |
michael@0 | 1534 | const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); |
michael@0 | 1535 | const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); |
michael@0 | 1536 | |
michael@0 | 1537 | u[0] = _mm_unpacklo_epi32(lstep1[0], lstep1[2]); |
michael@0 | 1538 | u[1] = _mm_unpackhi_epi32(lstep1[0], lstep1[2]); |
michael@0 | 1539 | u[2] = _mm_unpacklo_epi32(lstep1[1], lstep1[3]); |
michael@0 | 1540 | u[3] = _mm_unpackhi_epi32(lstep1[1], lstep1[3]); |
michael@0 | 1541 | u[4] = _mm_unpacklo_epi32(lstep1[4], lstep1[6]); |
michael@0 | 1542 | u[5] = _mm_unpackhi_epi32(lstep1[4], lstep1[6]); |
michael@0 | 1543 | u[6] = _mm_unpacklo_epi32(lstep1[5], lstep1[7]); |
michael@0 | 1544 | u[7] = _mm_unpackhi_epi32(lstep1[5], lstep1[7]); |
michael@0 | 1545 | |
michael@0 | 1546 | // TODO(jingning): manually inline k_madd_epi32() to further hide |
michael@0 | 1547 | // instruction latency. |
michael@0 | 1548 | v[ 0] = k_madd_epi32(u[0], k32_p16_p16); |
michael@0 | 1549 | v[ 1] = k_madd_epi32(u[1], k32_p16_p16); |
michael@0 | 1550 | v[ 2] = k_madd_epi32(u[2], k32_p16_p16); |
michael@0 | 1551 | v[ 3] = k_madd_epi32(u[3], k32_p16_p16); |
michael@0 | 1552 | v[ 4] = k_madd_epi32(u[0], k32_p16_m16); |
michael@0 | 1553 | v[ 5] = k_madd_epi32(u[1], k32_p16_m16); |
michael@0 | 1554 | v[ 6] = k_madd_epi32(u[2], k32_p16_m16); |
michael@0 | 1555 | v[ 7] = k_madd_epi32(u[3], k32_p16_m16); |
michael@0 | 1556 | v[ 8] = k_madd_epi32(u[4], k32_p24_p08); |
michael@0 | 1557 | v[ 9] = k_madd_epi32(u[5], k32_p24_p08); |
michael@0 | 1558 | v[10] = k_madd_epi32(u[6], k32_p24_p08); |
michael@0 | 1559 | v[11] = k_madd_epi32(u[7], k32_p24_p08); |
michael@0 | 1560 | v[12] = k_madd_epi32(u[4], k32_m08_p24); |
michael@0 | 1561 | v[13] = k_madd_epi32(u[5], k32_m08_p24); |
michael@0 | 1562 | v[14] = k_madd_epi32(u[6], k32_m08_p24); |
michael@0 | 1563 | v[15] = k_madd_epi32(u[7], k32_m08_p24); |
michael@0 | 1564 | |
michael@0 | 1565 | u[0] = k_packs_epi64(v[0], v[1]); |
michael@0 | 1566 | u[1] = k_packs_epi64(v[2], v[3]); |
michael@0 | 1567 | u[2] = k_packs_epi64(v[4], v[5]); |
michael@0 | 1568 | u[3] = k_packs_epi64(v[6], v[7]); |
michael@0 | 1569 | u[4] = k_packs_epi64(v[8], v[9]); |
michael@0 | 1570 | u[5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 1571 | u[6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 1572 | u[7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 1573 | |
michael@0 | 1574 | v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1575 | v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1576 | v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1577 | v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1578 | v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1579 | v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1580 | v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1581 | v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1582 | |
michael@0 | 1583 | u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
michael@0 | 1584 | u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
michael@0 | 1585 | u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
michael@0 | 1586 | u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
michael@0 | 1587 | u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); |
michael@0 | 1588 | u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); |
michael@0 | 1589 | u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); |
michael@0 | 1590 | u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); |
michael@0 | 1591 | |
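// _mm_cmplt_epi32 yields an all-ones (-1) mask for negative lanes, so the
// subtraction adds one to them; with the +1 bias and the shift this computes
// (x + 1 + (x < 0)) >> 2 before packing out[0], out[8], out[16] and out[24].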
michael@0 | 1592 | sign[0] = _mm_cmplt_epi32(u[0], kZero); |
michael@0 | 1593 | sign[1] = _mm_cmplt_epi32(u[1], kZero); |
michael@0 | 1594 | sign[2] = _mm_cmplt_epi32(u[2], kZero); |
michael@0 | 1595 | sign[3] = _mm_cmplt_epi32(u[3], kZero); |
michael@0 | 1596 | sign[4] = _mm_cmplt_epi32(u[4], kZero); |
michael@0 | 1597 | sign[5] = _mm_cmplt_epi32(u[5], kZero); |
michael@0 | 1598 | sign[6] = _mm_cmplt_epi32(u[6], kZero); |
michael@0 | 1599 | sign[7] = _mm_cmplt_epi32(u[7], kZero); |
michael@0 | 1600 | |
michael@0 | 1601 | u[0] = _mm_sub_epi32(u[0], sign[0]); |
michael@0 | 1602 | u[1] = _mm_sub_epi32(u[1], sign[1]); |
michael@0 | 1603 | u[2] = _mm_sub_epi32(u[2], sign[2]); |
michael@0 | 1604 | u[3] = _mm_sub_epi32(u[3], sign[3]); |
michael@0 | 1605 | u[4] = _mm_sub_epi32(u[4], sign[4]); |
michael@0 | 1606 | u[5] = _mm_sub_epi32(u[5], sign[5]); |
michael@0 | 1607 | u[6] = _mm_sub_epi32(u[6], sign[6]); |
michael@0 | 1608 | u[7] = _mm_sub_epi32(u[7], sign[7]); |
michael@0 | 1609 | |
michael@0 | 1610 | u[0] = _mm_add_epi32(u[0], K32One); |
michael@0 | 1611 | u[1] = _mm_add_epi32(u[1], K32One); |
michael@0 | 1612 | u[2] = _mm_add_epi32(u[2], K32One); |
michael@0 | 1613 | u[3] = _mm_add_epi32(u[3], K32One); |
michael@0 | 1614 | u[4] = _mm_add_epi32(u[4], K32One); |
michael@0 | 1615 | u[5] = _mm_add_epi32(u[5], K32One); |
michael@0 | 1616 | u[6] = _mm_add_epi32(u[6], K32One); |
michael@0 | 1617 | u[7] = _mm_add_epi32(u[7], K32One); |
michael@0 | 1618 | |
michael@0 | 1619 | u[0] = _mm_srai_epi32(u[0], 2); |
michael@0 | 1620 | u[1] = _mm_srai_epi32(u[1], 2); |
michael@0 | 1621 | u[2] = _mm_srai_epi32(u[2], 2); |
michael@0 | 1622 | u[3] = _mm_srai_epi32(u[3], 2); |
michael@0 | 1623 | u[4] = _mm_srai_epi32(u[4], 2); |
michael@0 | 1624 | u[5] = _mm_srai_epi32(u[5], 2); |
michael@0 | 1625 | u[6] = _mm_srai_epi32(u[6], 2); |
michael@0 | 1626 | u[7] = _mm_srai_epi32(u[7], 2); |
michael@0 | 1627 | |
michael@0 | 1628 | // Combine |
michael@0 | 1629 | out[ 0] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 1630 | out[16] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 1631 | out[ 8] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 1632 | out[24] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 1633 | } |
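// Cross-multiply lstep1[18..21] and lstep1[26..29] by the
// +/-cospi_8_64 / cospi_24_64 pairs, keeping the rounded results at
// 32-bit precision in lstep2[18..21] and lstep2[26..29].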
michael@0 | 1634 | { |
michael@0 | 1635 | const __m128i k32_m08_p24 = pair_set_epi32(-cospi_8_64, cospi_24_64); |
michael@0 | 1636 | const __m128i k32_m24_m08 = pair_set_epi32(-cospi_24_64, -cospi_8_64); |
michael@0 | 1637 | const __m128i k32_p24_p08 = pair_set_epi32(cospi_24_64, cospi_8_64); |
michael@0 | 1638 | |
michael@0 | 1639 | u[0] = _mm_unpacklo_epi32(lstep1[18], lstep1[28]); |
michael@0 | 1640 | u[1] = _mm_unpackhi_epi32(lstep1[18], lstep1[28]); |
michael@0 | 1641 | u[2] = _mm_unpacklo_epi32(lstep1[19], lstep1[29]); |
michael@0 | 1642 | u[3] = _mm_unpackhi_epi32(lstep1[19], lstep1[29]); |
michael@0 | 1643 | u[4] = _mm_unpacklo_epi32(lstep1[20], lstep1[26]); |
michael@0 | 1644 | u[5] = _mm_unpackhi_epi32(lstep1[20], lstep1[26]); |
michael@0 | 1645 | u[6] = _mm_unpacklo_epi32(lstep1[21], lstep1[27]); |
michael@0 | 1646 | u[7] = _mm_unpackhi_epi32(lstep1[21], lstep1[27]); |
michael@0 | 1647 | |
michael@0 | 1648 | v[0] = k_madd_epi32(u[0], k32_m08_p24); |
michael@0 | 1649 | v[1] = k_madd_epi32(u[1], k32_m08_p24); |
michael@0 | 1650 | v[2] = k_madd_epi32(u[2], k32_m08_p24); |
michael@0 | 1651 | v[3] = k_madd_epi32(u[3], k32_m08_p24); |
michael@0 | 1652 | v[4] = k_madd_epi32(u[4], k32_m24_m08); |
michael@0 | 1653 | v[5] = k_madd_epi32(u[5], k32_m24_m08); |
michael@0 | 1654 | v[6] = k_madd_epi32(u[6], k32_m24_m08); |
michael@0 | 1655 | v[7] = k_madd_epi32(u[7], k32_m24_m08); |
michael@0 | 1656 | v[ 8] = k_madd_epi32(u[4], k32_m08_p24); |
michael@0 | 1657 | v[ 9] = k_madd_epi32(u[5], k32_m08_p24); |
michael@0 | 1658 | v[10] = k_madd_epi32(u[6], k32_m08_p24); |
michael@0 | 1659 | v[11] = k_madd_epi32(u[7], k32_m08_p24); |
michael@0 | 1660 | v[12] = k_madd_epi32(u[0], k32_p24_p08); |
michael@0 | 1661 | v[13] = k_madd_epi32(u[1], k32_p24_p08); |
michael@0 | 1662 | v[14] = k_madd_epi32(u[2], k32_p24_p08); |
michael@0 | 1663 | v[15] = k_madd_epi32(u[3], k32_p24_p08); |
michael@0 | 1664 | |
michael@0 | 1665 | u[0] = k_packs_epi64(v[0], v[1]); |
michael@0 | 1666 | u[1] = k_packs_epi64(v[2], v[3]); |
michael@0 | 1667 | u[2] = k_packs_epi64(v[4], v[5]); |
michael@0 | 1668 | u[3] = k_packs_epi64(v[6], v[7]); |
michael@0 | 1669 | u[4] = k_packs_epi64(v[8], v[9]); |
michael@0 | 1670 | u[5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 1671 | u[6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 1672 | u[7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 1673 | |
michael@0 | 1674 | u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1675 | u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1676 | u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1677 | u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1678 | u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1679 | u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1680 | u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1681 | u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1682 | |
michael@0 | 1683 | lstep2[18] = _mm_srai_epi32(u[0], DCT_CONST_BITS); |
michael@0 | 1684 | lstep2[19] = _mm_srai_epi32(u[1], DCT_CONST_BITS); |
michael@0 | 1685 | lstep2[20] = _mm_srai_epi32(u[2], DCT_CONST_BITS); |
michael@0 | 1686 | lstep2[21] = _mm_srai_epi32(u[3], DCT_CONST_BITS); |
michael@0 | 1687 | lstep2[26] = _mm_srai_epi32(u[4], DCT_CONST_BITS); |
michael@0 | 1688 | lstep2[27] = _mm_srai_epi32(u[5], DCT_CONST_BITS); |
michael@0 | 1689 | lstep2[28] = _mm_srai_epi32(u[6], DCT_CONST_BITS); |
michael@0 | 1690 | lstep2[29] = _mm_srai_epi32(u[7], DCT_CONST_BITS); |
michael@0 | 1691 | } |
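// Butterfly add/sub of lstep1[32..63] against lstep3[32..63], written
// to lstep2[32..63].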
michael@0 | 1692 | { |
michael@0 | 1693 | lstep2[32] = _mm_add_epi32(lstep1[38], lstep3[32]); |
michael@0 | 1694 | lstep2[33] = _mm_add_epi32(lstep1[39], lstep3[33]); |
michael@0 | 1695 | lstep2[34] = _mm_add_epi32(lstep1[36], lstep3[34]); |
michael@0 | 1696 | lstep2[35] = _mm_add_epi32(lstep1[37], lstep3[35]); |
michael@0 | 1697 | lstep2[36] = _mm_sub_epi32(lstep3[34], lstep1[36]); |
michael@0 | 1698 | lstep2[37] = _mm_sub_epi32(lstep3[35], lstep1[37]); |
michael@0 | 1699 | lstep2[38] = _mm_sub_epi32(lstep3[32], lstep1[38]); |
michael@0 | 1700 | lstep2[39] = _mm_sub_epi32(lstep3[33], lstep1[39]); |
michael@0 | 1701 | lstep2[40] = _mm_sub_epi32(lstep3[46], lstep1[40]); |
michael@0 | 1702 | lstep2[41] = _mm_sub_epi32(lstep3[47], lstep1[41]); |
michael@0 | 1703 | lstep2[42] = _mm_sub_epi32(lstep3[44], lstep1[42]); |
michael@0 | 1704 | lstep2[43] = _mm_sub_epi32(lstep3[45], lstep1[43]); |
michael@0 | 1705 | lstep2[44] = _mm_add_epi32(lstep1[42], lstep3[44]); |
michael@0 | 1706 | lstep2[45] = _mm_add_epi32(lstep1[43], lstep3[45]); |
michael@0 | 1707 | lstep2[46] = _mm_add_epi32(lstep1[40], lstep3[46]); |
michael@0 | 1708 | lstep2[47] = _mm_add_epi32(lstep1[41], lstep3[47]); |
michael@0 | 1709 | lstep2[48] = _mm_add_epi32(lstep1[54], lstep3[48]); |
michael@0 | 1710 | lstep2[49] = _mm_add_epi32(lstep1[55], lstep3[49]); |
michael@0 | 1711 | lstep2[50] = _mm_add_epi32(lstep1[52], lstep3[50]); |
michael@0 | 1712 | lstep2[51] = _mm_add_epi32(lstep1[53], lstep3[51]); |
michael@0 | 1713 | lstep2[52] = _mm_sub_epi32(lstep3[50], lstep1[52]); |
michael@0 | 1714 | lstep2[53] = _mm_sub_epi32(lstep3[51], lstep1[53]); |
michael@0 | 1715 | lstep2[54] = _mm_sub_epi32(lstep3[48], lstep1[54]); |
michael@0 | 1716 | lstep2[55] = _mm_sub_epi32(lstep3[49], lstep1[55]); |
michael@0 | 1717 | lstep2[56] = _mm_sub_epi32(lstep3[62], lstep1[56]); |
michael@0 | 1718 | lstep2[57] = _mm_sub_epi32(lstep3[63], lstep1[57]); |
michael@0 | 1719 | lstep2[58] = _mm_sub_epi32(lstep3[60], lstep1[58]); |
michael@0 | 1720 | lstep2[59] = _mm_sub_epi32(lstep3[61], lstep1[59]); |
michael@0 | 1721 | lstep2[60] = _mm_add_epi32(lstep1[58], lstep3[60]); |
michael@0 | 1722 | lstep2[61] = _mm_add_epi32(lstep1[59], lstep3[61]); |
michael@0 | 1723 | lstep2[62] = _mm_add_epi32(lstep1[56], lstep3[62]); |
michael@0 | 1724 | lstep2[63] = _mm_add_epi32(lstep1[57], lstep3[63]); |
michael@0 | 1725 | } |
michael@0 | 1726 | // stage 6 |
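// The first block rotates lstep2[8..15] by the +/-cospi_4/28 and
// +/-cospi_12/20 pairs to produce out[4], out[20], out[12] and out[28];
// the remaining blocks update lstep3[16..31] and lstep3[34..61] for the
// later stages.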
michael@0 | 1727 | { |
michael@0 | 1728 | const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); |
michael@0 | 1729 | const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); |
michael@0 | 1730 | const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); |
michael@0 | 1731 | const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); |
michael@0 | 1732 | |
michael@0 | 1733 | u[0] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); |
michael@0 | 1734 | u[1] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); |
michael@0 | 1735 | u[2] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); |
michael@0 | 1736 | u[3] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); |
michael@0 | 1737 | u[4] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); |
michael@0 | 1738 | u[5] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); |
michael@0 | 1739 | u[6] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); |
michael@0 | 1740 | u[7] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); |
michael@0 | 1741 | u[8] = _mm_unpacklo_epi32(lstep2[10], lstep2[12]); |
michael@0 | 1742 | u[9] = _mm_unpackhi_epi32(lstep2[10], lstep2[12]); |
michael@0 | 1743 | u[10] = _mm_unpacklo_epi32(lstep2[11], lstep2[13]); |
michael@0 | 1744 | u[11] = _mm_unpackhi_epi32(lstep2[11], lstep2[13]); |
michael@0 | 1745 | u[12] = _mm_unpacklo_epi32(lstep2[ 8], lstep2[14]); |
michael@0 | 1746 | u[13] = _mm_unpackhi_epi32(lstep2[ 8], lstep2[14]); |
michael@0 | 1747 | u[14] = _mm_unpacklo_epi32(lstep2[ 9], lstep2[15]); |
michael@0 | 1748 | u[15] = _mm_unpackhi_epi32(lstep2[ 9], lstep2[15]); |
michael@0 | 1749 | |
michael@0 | 1750 | v[0] = k_madd_epi32(u[0], k32_p28_p04); |
michael@0 | 1751 | v[1] = k_madd_epi32(u[1], k32_p28_p04); |
michael@0 | 1752 | v[2] = k_madd_epi32(u[2], k32_p28_p04); |
michael@0 | 1753 | v[3] = k_madd_epi32(u[3], k32_p28_p04); |
michael@0 | 1754 | v[4] = k_madd_epi32(u[4], k32_p12_p20); |
michael@0 | 1755 | v[5] = k_madd_epi32(u[5], k32_p12_p20); |
michael@0 | 1756 | v[6] = k_madd_epi32(u[6], k32_p12_p20); |
michael@0 | 1757 | v[7] = k_madd_epi32(u[7], k32_p12_p20); |
michael@0 | 1758 | v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); |
michael@0 | 1759 | v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); |
michael@0 | 1760 | v[10] = k_madd_epi32(u[10], k32_m20_p12); |
michael@0 | 1761 | v[11] = k_madd_epi32(u[11], k32_m20_p12); |
michael@0 | 1762 | v[12] = k_madd_epi32(u[12], k32_m04_p28); |
michael@0 | 1763 | v[13] = k_madd_epi32(u[13], k32_m04_p28); |
michael@0 | 1764 | v[14] = k_madd_epi32(u[14], k32_m04_p28); |
michael@0 | 1765 | v[15] = k_madd_epi32(u[15], k32_m04_p28); |
michael@0 | 1766 | |
michael@0 | 1767 | u[0] = k_packs_epi64(v[0], v[1]); |
michael@0 | 1768 | u[1] = k_packs_epi64(v[2], v[3]); |
michael@0 | 1769 | u[2] = k_packs_epi64(v[4], v[5]); |
michael@0 | 1770 | u[3] = k_packs_epi64(v[6], v[7]); |
michael@0 | 1771 | u[4] = k_packs_epi64(v[8], v[9]); |
michael@0 | 1772 | u[5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 1773 | u[6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 1774 | u[7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 1775 | |
michael@0 | 1776 | v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); |
michael@0 | 1777 | v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); |
michael@0 | 1778 | v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); |
michael@0 | 1779 | v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); |
michael@0 | 1780 | v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); |
michael@0 | 1781 | v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); |
michael@0 | 1782 | v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); |
michael@0 | 1783 | v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); |
michael@0 | 1784 | |
michael@0 | 1785 | u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); |
michael@0 | 1786 | u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); |
michael@0 | 1787 | u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); |
michael@0 | 1788 | u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); |
michael@0 | 1789 | u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); |
michael@0 | 1790 | u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); |
michael@0 | 1791 | u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); |
michael@0 | 1792 | u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); |
michael@0 | 1793 | |
michael@0 | 1794 | sign[0] = _mm_cmplt_epi32(u[0], kZero); |
michael@0 | 1795 | sign[1] = _mm_cmplt_epi32(u[1], kZero); |
michael@0 | 1796 | sign[2] = _mm_cmplt_epi32(u[2], kZero); |
michael@0 | 1797 | sign[3] = _mm_cmplt_epi32(u[3], kZero); |
michael@0 | 1798 | sign[4] = _mm_cmplt_epi32(u[4], kZero); |
michael@0 | 1799 | sign[5] = _mm_cmplt_epi32(u[5], kZero); |
michael@0 | 1800 | sign[6] = _mm_cmplt_epi32(u[6], kZero); |
michael@0 | 1801 | sign[7] = _mm_cmplt_epi32(u[7], kZero); |
michael@0 | 1802 | |
michael@0 | 1803 | u[0] = _mm_sub_epi32(u[0], sign[0]); |
michael@0 | 1804 | u[1] = _mm_sub_epi32(u[1], sign[1]); |
michael@0 | 1805 | u[2] = _mm_sub_epi32(u[2], sign[2]); |
michael@0 | 1806 | u[3] = _mm_sub_epi32(u[3], sign[3]); |
michael@0 | 1807 | u[4] = _mm_sub_epi32(u[4], sign[4]); |
michael@0 | 1808 | u[5] = _mm_sub_epi32(u[5], sign[5]); |
michael@0 | 1809 | u[6] = _mm_sub_epi32(u[6], sign[6]); |
michael@0 | 1810 | u[7] = _mm_sub_epi32(u[7], sign[7]); |
michael@0 | 1811 | |
michael@0 | 1812 | u[0] = _mm_add_epi32(u[0], K32One); |
michael@0 | 1813 | u[1] = _mm_add_epi32(u[1], K32One); |
michael@0 | 1814 | u[2] = _mm_add_epi32(u[2], K32One); |
michael@0 | 1815 | u[3] = _mm_add_epi32(u[3], K32One); |
michael@0 | 1816 | u[4] = _mm_add_epi32(u[4], K32One); |
michael@0 | 1817 | u[5] = _mm_add_epi32(u[5], K32One); |
michael@0 | 1818 | u[6] = _mm_add_epi32(u[6], K32One); |
michael@0 | 1819 | u[7] = _mm_add_epi32(u[7], K32One); |
michael@0 | 1820 | |
michael@0 | 1821 | u[0] = _mm_srai_epi32(u[0], 2); |
michael@0 | 1822 | u[1] = _mm_srai_epi32(u[1], 2); |
michael@0 | 1823 | u[2] = _mm_srai_epi32(u[2], 2); |
michael@0 | 1824 | u[3] = _mm_srai_epi32(u[3], 2); |
michael@0 | 1825 | u[4] = _mm_srai_epi32(u[4], 2); |
michael@0 | 1826 | u[5] = _mm_srai_epi32(u[5], 2); |
michael@0 | 1827 | u[6] = _mm_srai_epi32(u[6], 2); |
michael@0 | 1828 | u[7] = _mm_srai_epi32(u[7], 2); |
michael@0 | 1829 | |
michael@0 | 1830 | out[ 4] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 1831 | out[20] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 1832 | out[12] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 1833 | out[28] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 1834 | } |
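// Combine lstep1[16..17], [22..25] and [30..31] with the rotated
// lstep2[18..21]/[26..29] terms to form lstep3[16..31].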
michael@0 | 1835 | { |
michael@0 | 1836 | lstep3[16] = _mm_add_epi32(lstep2[18], lstep1[16]); |
michael@0 | 1837 | lstep3[17] = _mm_add_epi32(lstep2[19], lstep1[17]); |
michael@0 | 1838 | lstep3[18] = _mm_sub_epi32(lstep1[16], lstep2[18]); |
michael@0 | 1839 | lstep3[19] = _mm_sub_epi32(lstep1[17], lstep2[19]); |
michael@0 | 1840 | lstep3[20] = _mm_sub_epi32(lstep1[22], lstep2[20]); |
michael@0 | 1841 | lstep3[21] = _mm_sub_epi32(lstep1[23], lstep2[21]); |
michael@0 | 1842 | lstep3[22] = _mm_add_epi32(lstep2[20], lstep1[22]); |
michael@0 | 1843 | lstep3[23] = _mm_add_epi32(lstep2[21], lstep1[23]); |
michael@0 | 1844 | lstep3[24] = _mm_add_epi32(lstep2[26], lstep1[24]); |
michael@0 | 1845 | lstep3[25] = _mm_add_epi32(lstep2[27], lstep1[25]); |
michael@0 | 1846 | lstep3[26] = _mm_sub_epi32(lstep1[24], lstep2[26]); |
michael@0 | 1847 | lstep3[27] = _mm_sub_epi32(lstep1[25], lstep2[27]); |
michael@0 | 1848 | lstep3[28] = _mm_sub_epi32(lstep1[30], lstep2[28]); |
michael@0 | 1849 | lstep3[29] = _mm_sub_epi32(lstep1[31], lstep2[29]); |
michael@0 | 1850 | lstep3[30] = _mm_add_epi32(lstep2[28], lstep1[30]); |
michael@0 | 1851 | lstep3[31] = _mm_add_epi32(lstep2[29], lstep1[31]); |
michael@0 | 1852 | } |
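// Rotate lstep2[34..37]/[58..61] and lstep2[42..45]/[50..53] by the
// +/-cospi_4/28 and +/-cospi_12/20 pairs into lstep3[34..37], [42..45],
// [50..53] and [58..61].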
michael@0 | 1853 | { |
michael@0 | 1854 | const __m128i k32_m04_p28 = pair_set_epi32(-cospi_4_64, cospi_28_64); |
michael@0 | 1855 | const __m128i k32_m28_m04 = pair_set_epi32(-cospi_28_64, -cospi_4_64); |
michael@0 | 1856 | const __m128i k32_m20_p12 = pair_set_epi32(-cospi_20_64, cospi_12_64); |
michael@0 | 1857 | const __m128i k32_m12_m20 = pair_set_epi32(-cospi_12_64, |
michael@0 | 1858 | -cospi_20_64); |
michael@0 | 1859 | const __m128i k32_p12_p20 = pair_set_epi32(cospi_12_64, cospi_20_64); |
michael@0 | 1860 | const __m128i k32_p28_p04 = pair_set_epi32(cospi_28_64, cospi_4_64); |
michael@0 | 1861 | |
michael@0 | 1862 | u[ 0] = _mm_unpacklo_epi32(lstep2[34], lstep2[60]); |
michael@0 | 1863 | u[ 1] = _mm_unpackhi_epi32(lstep2[34], lstep2[60]); |
michael@0 | 1864 | u[ 2] = _mm_unpacklo_epi32(lstep2[35], lstep2[61]); |
michael@0 | 1865 | u[ 3] = _mm_unpackhi_epi32(lstep2[35], lstep2[61]); |
michael@0 | 1866 | u[ 4] = _mm_unpacklo_epi32(lstep2[36], lstep2[58]); |
michael@0 | 1867 | u[ 5] = _mm_unpackhi_epi32(lstep2[36], lstep2[58]); |
michael@0 | 1868 | u[ 6] = _mm_unpacklo_epi32(lstep2[37], lstep2[59]); |
michael@0 | 1869 | u[ 7] = _mm_unpackhi_epi32(lstep2[37], lstep2[59]); |
michael@0 | 1870 | u[ 8] = _mm_unpacklo_epi32(lstep2[42], lstep2[52]); |
michael@0 | 1871 | u[ 9] = _mm_unpackhi_epi32(lstep2[42], lstep2[52]); |
michael@0 | 1872 | u[10] = _mm_unpacklo_epi32(lstep2[43], lstep2[53]); |
michael@0 | 1873 | u[11] = _mm_unpackhi_epi32(lstep2[43], lstep2[53]); |
michael@0 | 1874 | u[12] = _mm_unpacklo_epi32(lstep2[44], lstep2[50]); |
michael@0 | 1875 | u[13] = _mm_unpackhi_epi32(lstep2[44], lstep2[50]); |
michael@0 | 1876 | u[14] = _mm_unpacklo_epi32(lstep2[45], lstep2[51]); |
michael@0 | 1877 | u[15] = _mm_unpackhi_epi32(lstep2[45], lstep2[51]); |
michael@0 | 1878 | |
michael@0 | 1879 | v[ 0] = k_madd_epi32(u[ 0], k32_m04_p28); |
michael@0 | 1880 | v[ 1] = k_madd_epi32(u[ 1], k32_m04_p28); |
michael@0 | 1881 | v[ 2] = k_madd_epi32(u[ 2], k32_m04_p28); |
michael@0 | 1882 | v[ 3] = k_madd_epi32(u[ 3], k32_m04_p28); |
michael@0 | 1883 | v[ 4] = k_madd_epi32(u[ 4], k32_m28_m04); |
michael@0 | 1884 | v[ 5] = k_madd_epi32(u[ 5], k32_m28_m04); |
michael@0 | 1885 | v[ 6] = k_madd_epi32(u[ 6], k32_m28_m04); |
michael@0 | 1886 | v[ 7] = k_madd_epi32(u[ 7], k32_m28_m04); |
michael@0 | 1887 | v[ 8] = k_madd_epi32(u[ 8], k32_m20_p12); |
michael@0 | 1888 | v[ 9] = k_madd_epi32(u[ 9], k32_m20_p12); |
michael@0 | 1889 | v[10] = k_madd_epi32(u[10], k32_m20_p12); |
michael@0 | 1890 | v[11] = k_madd_epi32(u[11], k32_m20_p12); |
michael@0 | 1891 | v[12] = k_madd_epi32(u[12], k32_m12_m20); |
michael@0 | 1892 | v[13] = k_madd_epi32(u[13], k32_m12_m20); |
michael@0 | 1893 | v[14] = k_madd_epi32(u[14], k32_m12_m20); |
michael@0 | 1894 | v[15] = k_madd_epi32(u[15], k32_m12_m20); |
michael@0 | 1895 | v[16] = k_madd_epi32(u[12], k32_m20_p12); |
michael@0 | 1896 | v[17] = k_madd_epi32(u[13], k32_m20_p12); |
michael@0 | 1897 | v[18] = k_madd_epi32(u[14], k32_m20_p12); |
michael@0 | 1898 | v[19] = k_madd_epi32(u[15], k32_m20_p12); |
michael@0 | 1899 | v[20] = k_madd_epi32(u[ 8], k32_p12_p20); |
michael@0 | 1900 | v[21] = k_madd_epi32(u[ 9], k32_p12_p20); |
michael@0 | 1901 | v[22] = k_madd_epi32(u[10], k32_p12_p20); |
michael@0 | 1902 | v[23] = k_madd_epi32(u[11], k32_p12_p20); |
michael@0 | 1903 | v[24] = k_madd_epi32(u[ 4], k32_m04_p28); |
michael@0 | 1904 | v[25] = k_madd_epi32(u[ 5], k32_m04_p28); |
michael@0 | 1905 | v[26] = k_madd_epi32(u[ 6], k32_m04_p28); |
michael@0 | 1906 | v[27] = k_madd_epi32(u[ 7], k32_m04_p28); |
michael@0 | 1907 | v[28] = k_madd_epi32(u[ 0], k32_p28_p04); |
michael@0 | 1908 | v[29] = k_madd_epi32(u[ 1], k32_p28_p04); |
michael@0 | 1909 | v[30] = k_madd_epi32(u[ 2], k32_p28_p04); |
michael@0 | 1910 | v[31] = k_madd_epi32(u[ 3], k32_p28_p04); |
michael@0 | 1911 | |
michael@0 | 1912 | u[ 0] = k_packs_epi64(v[ 0], v[ 1]); |
michael@0 | 1913 | u[ 1] = k_packs_epi64(v[ 2], v[ 3]); |
michael@0 | 1914 | u[ 2] = k_packs_epi64(v[ 4], v[ 5]); |
michael@0 | 1915 | u[ 3] = k_packs_epi64(v[ 6], v[ 7]); |
michael@0 | 1916 | u[ 4] = k_packs_epi64(v[ 8], v[ 9]); |
michael@0 | 1917 | u[ 5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 1918 | u[ 6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 1919 | u[ 7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 1920 | u[ 8] = k_packs_epi64(v[16], v[17]); |
michael@0 | 1921 | u[ 9] = k_packs_epi64(v[18], v[19]); |
michael@0 | 1922 | u[10] = k_packs_epi64(v[20], v[21]); |
michael@0 | 1923 | u[11] = k_packs_epi64(v[22], v[23]); |
michael@0 | 1924 | u[12] = k_packs_epi64(v[24], v[25]); |
michael@0 | 1925 | u[13] = k_packs_epi64(v[26], v[27]); |
michael@0 | 1926 | u[14] = k_packs_epi64(v[28], v[29]); |
michael@0 | 1927 | u[15] = k_packs_epi64(v[30], v[31]); |
michael@0 | 1928 | |
michael@0 | 1929 | v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); |
michael@0 | 1930 | v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); |
michael@0 | 1931 | v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); |
michael@0 | 1932 | v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); |
michael@0 | 1933 | v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); |
michael@0 | 1934 | v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); |
michael@0 | 1935 | v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); |
michael@0 | 1936 | v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); |
michael@0 | 1937 | v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); |
michael@0 | 1938 | v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); |
michael@0 | 1939 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 1940 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 1941 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 1942 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 1943 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 1944 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 1945 | |
michael@0 | 1946 | lstep3[34] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); |
michael@0 | 1947 | lstep3[35] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); |
michael@0 | 1948 | lstep3[36] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); |
michael@0 | 1949 | lstep3[37] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); |
michael@0 | 1950 | lstep3[42] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); |
michael@0 | 1951 | lstep3[43] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); |
michael@0 | 1952 | lstep3[44] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); |
michael@0 | 1953 | lstep3[45] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); |
michael@0 | 1954 | lstep3[50] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); |
michael@0 | 1955 | lstep3[51] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); |
michael@0 | 1956 | lstep3[52] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 1957 | lstep3[53] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 1958 | lstep3[58] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 1959 | lstep3[59] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 1960 | lstep3[60] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 1961 | lstep3[61] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 1962 | } |
michael@0 | 1963 | // stage 7 |
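// Rotate lstep3[16..31] by the cospi_2/30, cospi_18/14, cospi_10/22 and
// cospi_26/6 pairs to produce out[2], out[18], out[10], out[26], out[6],
// out[22], out[14] and out[30], then rebuild lstep1[32..63] by add/sub.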
michael@0 | 1964 | { |
michael@0 | 1965 | const __m128i k32_p30_p02 = pair_set_epi32(cospi_30_64, cospi_2_64); |
michael@0 | 1966 | const __m128i k32_p14_p18 = pair_set_epi32(cospi_14_64, cospi_18_64); |
michael@0 | 1967 | const __m128i k32_p22_p10 = pair_set_epi32(cospi_22_64, cospi_10_64); |
michael@0 | 1968 | const __m128i k32_p06_p26 = pair_set_epi32(cospi_6_64, cospi_26_64); |
michael@0 | 1969 | const __m128i k32_m26_p06 = pair_set_epi32(-cospi_26_64, cospi_6_64); |
michael@0 | 1970 | const __m128i k32_m10_p22 = pair_set_epi32(-cospi_10_64, cospi_22_64); |
michael@0 | 1971 | const __m128i k32_m18_p14 = pair_set_epi32(-cospi_18_64, cospi_14_64); |
michael@0 | 1972 | const __m128i k32_m02_p30 = pair_set_epi32(-cospi_2_64, cospi_30_64); |
michael@0 | 1973 | |
michael@0 | 1974 | u[ 0] = _mm_unpacklo_epi32(lstep3[16], lstep3[30]); |
michael@0 | 1975 | u[ 1] = _mm_unpackhi_epi32(lstep3[16], lstep3[30]); |
michael@0 | 1976 | u[ 2] = _mm_unpacklo_epi32(lstep3[17], lstep3[31]); |
michael@0 | 1977 | u[ 3] = _mm_unpackhi_epi32(lstep3[17], lstep3[31]); |
michael@0 | 1978 | u[ 4] = _mm_unpacklo_epi32(lstep3[18], lstep3[28]); |
michael@0 | 1979 | u[ 5] = _mm_unpackhi_epi32(lstep3[18], lstep3[28]); |
michael@0 | 1980 | u[ 6] = _mm_unpacklo_epi32(lstep3[19], lstep3[29]); |
michael@0 | 1981 | u[ 7] = _mm_unpackhi_epi32(lstep3[19], lstep3[29]); |
michael@0 | 1982 | u[ 8] = _mm_unpacklo_epi32(lstep3[20], lstep3[26]); |
michael@0 | 1983 | u[ 9] = _mm_unpackhi_epi32(lstep3[20], lstep3[26]); |
michael@0 | 1984 | u[10] = _mm_unpacklo_epi32(lstep3[21], lstep3[27]); |
michael@0 | 1985 | u[11] = _mm_unpackhi_epi32(lstep3[21], lstep3[27]); |
michael@0 | 1986 | u[12] = _mm_unpacklo_epi32(lstep3[22], lstep3[24]); |
michael@0 | 1987 | u[13] = _mm_unpackhi_epi32(lstep3[22], lstep3[24]); |
michael@0 | 1988 | u[14] = _mm_unpacklo_epi32(lstep3[23], lstep3[25]); |
michael@0 | 1989 | u[15] = _mm_unpackhi_epi32(lstep3[23], lstep3[25]); |
michael@0 | 1990 | |
michael@0 | 1991 | v[ 0] = k_madd_epi32(u[ 0], k32_p30_p02); |
michael@0 | 1992 | v[ 1] = k_madd_epi32(u[ 1], k32_p30_p02); |
michael@0 | 1993 | v[ 2] = k_madd_epi32(u[ 2], k32_p30_p02); |
michael@0 | 1994 | v[ 3] = k_madd_epi32(u[ 3], k32_p30_p02); |
michael@0 | 1995 | v[ 4] = k_madd_epi32(u[ 4], k32_p14_p18); |
michael@0 | 1996 | v[ 5] = k_madd_epi32(u[ 5], k32_p14_p18); |
michael@0 | 1997 | v[ 6] = k_madd_epi32(u[ 6], k32_p14_p18); |
michael@0 | 1998 | v[ 7] = k_madd_epi32(u[ 7], k32_p14_p18); |
michael@0 | 1999 | v[ 8] = k_madd_epi32(u[ 8], k32_p22_p10); |
michael@0 | 2000 | v[ 9] = k_madd_epi32(u[ 9], k32_p22_p10); |
michael@0 | 2001 | v[10] = k_madd_epi32(u[10], k32_p22_p10); |
michael@0 | 2002 | v[11] = k_madd_epi32(u[11], k32_p22_p10); |
michael@0 | 2003 | v[12] = k_madd_epi32(u[12], k32_p06_p26); |
michael@0 | 2004 | v[13] = k_madd_epi32(u[13], k32_p06_p26); |
michael@0 | 2005 | v[14] = k_madd_epi32(u[14], k32_p06_p26); |
michael@0 | 2006 | v[15] = k_madd_epi32(u[15], k32_p06_p26); |
michael@0 | 2007 | v[16] = k_madd_epi32(u[12], k32_m26_p06); |
michael@0 | 2008 | v[17] = k_madd_epi32(u[13], k32_m26_p06); |
michael@0 | 2009 | v[18] = k_madd_epi32(u[14], k32_m26_p06); |
michael@0 | 2010 | v[19] = k_madd_epi32(u[15], k32_m26_p06); |
michael@0 | 2011 | v[20] = k_madd_epi32(u[ 8], k32_m10_p22); |
michael@0 | 2012 | v[21] = k_madd_epi32(u[ 9], k32_m10_p22); |
michael@0 | 2013 | v[22] = k_madd_epi32(u[10], k32_m10_p22); |
michael@0 | 2014 | v[23] = k_madd_epi32(u[11], k32_m10_p22); |
michael@0 | 2015 | v[24] = k_madd_epi32(u[ 4], k32_m18_p14); |
michael@0 | 2016 | v[25] = k_madd_epi32(u[ 5], k32_m18_p14); |
michael@0 | 2017 | v[26] = k_madd_epi32(u[ 6], k32_m18_p14); |
michael@0 | 2018 | v[27] = k_madd_epi32(u[ 7], k32_m18_p14); |
michael@0 | 2019 | v[28] = k_madd_epi32(u[ 0], k32_m02_p30); |
michael@0 | 2020 | v[29] = k_madd_epi32(u[ 1], k32_m02_p30); |
michael@0 | 2021 | v[30] = k_madd_epi32(u[ 2], k32_m02_p30); |
michael@0 | 2022 | v[31] = k_madd_epi32(u[ 3], k32_m02_p30); |
michael@0 | 2023 | |
michael@0 | 2024 | u[ 0] = k_packs_epi64(v[ 0], v[ 1]); |
michael@0 | 2025 | u[ 1] = k_packs_epi64(v[ 2], v[ 3]); |
michael@0 | 2026 | u[ 2] = k_packs_epi64(v[ 4], v[ 5]); |
michael@0 | 2027 | u[ 3] = k_packs_epi64(v[ 6], v[ 7]); |
michael@0 | 2028 | u[ 4] = k_packs_epi64(v[ 8], v[ 9]); |
michael@0 | 2029 | u[ 5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 2030 | u[ 6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 2031 | u[ 7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 2032 | u[ 8] = k_packs_epi64(v[16], v[17]); |
michael@0 | 2033 | u[ 9] = k_packs_epi64(v[18], v[19]); |
michael@0 | 2034 | u[10] = k_packs_epi64(v[20], v[21]); |
michael@0 | 2035 | u[11] = k_packs_epi64(v[22], v[23]); |
michael@0 | 2036 | u[12] = k_packs_epi64(v[24], v[25]); |
michael@0 | 2037 | u[13] = k_packs_epi64(v[26], v[27]); |
michael@0 | 2038 | u[14] = k_packs_epi64(v[28], v[29]); |
michael@0 | 2039 | u[15] = k_packs_epi64(v[30], v[31]); |
michael@0 | 2040 | |
michael@0 | 2041 | v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); |
michael@0 | 2042 | v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); |
michael@0 | 2043 | v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); |
michael@0 | 2044 | v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); |
michael@0 | 2045 | v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); |
michael@0 | 2046 | v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); |
michael@0 | 2047 | v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); |
michael@0 | 2048 | v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); |
michael@0 | 2049 | v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); |
michael@0 | 2050 | v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); |
michael@0 | 2051 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 2052 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 2053 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 2054 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 2055 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 2056 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 2057 | |
michael@0 | 2058 | u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); |
michael@0 | 2059 | u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); |
michael@0 | 2060 | u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); |
michael@0 | 2061 | u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); |
michael@0 | 2062 | u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); |
michael@0 | 2063 | u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); |
michael@0 | 2064 | u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); |
michael@0 | 2065 | u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); |
michael@0 | 2066 | u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); |
michael@0 | 2067 | u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); |
michael@0 | 2068 | u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 2069 | u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 2070 | u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 2071 | u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 2072 | u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 2073 | u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 2074 | |
michael@0 | 2075 | v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); |
michael@0 | 2076 | v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); |
michael@0 | 2077 | v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); |
michael@0 | 2078 | v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); |
michael@0 | 2079 | v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); |
michael@0 | 2080 | v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); |
michael@0 | 2081 | v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); |
michael@0 | 2082 | v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); |
michael@0 | 2083 | v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); |
michael@0 | 2084 | v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); |
michael@0 | 2085 | v[10] = _mm_cmplt_epi32(u[10], kZero); |
michael@0 | 2086 | v[11] = _mm_cmplt_epi32(u[11], kZero); |
michael@0 | 2087 | v[12] = _mm_cmplt_epi32(u[12], kZero); |
michael@0 | 2088 | v[13] = _mm_cmplt_epi32(u[13], kZero); |
michael@0 | 2089 | v[14] = _mm_cmplt_epi32(u[14], kZero); |
michael@0 | 2090 | v[15] = _mm_cmplt_epi32(u[15], kZero); |
michael@0 | 2091 | |
michael@0 | 2092 | u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); |
michael@0 | 2093 | u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); |
michael@0 | 2094 | u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); |
michael@0 | 2095 | u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); |
michael@0 | 2096 | u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); |
michael@0 | 2097 | u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); |
michael@0 | 2098 | u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); |
michael@0 | 2099 | u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); |
michael@0 | 2100 | u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); |
michael@0 | 2101 | u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); |
michael@0 | 2102 | u[10] = _mm_sub_epi32(u[10], v[10]); |
michael@0 | 2103 | u[11] = _mm_sub_epi32(u[11], v[11]); |
michael@0 | 2104 | u[12] = _mm_sub_epi32(u[12], v[12]); |
michael@0 | 2105 | u[13] = _mm_sub_epi32(u[13], v[13]); |
michael@0 | 2106 | u[14] = _mm_sub_epi32(u[14], v[14]); |
michael@0 | 2107 | u[15] = _mm_sub_epi32(u[15], v[15]); |
michael@0 | 2108 | |
michael@0 | 2109 | v[ 0] = _mm_add_epi32(u[ 0], K32One); |
michael@0 | 2110 | v[ 1] = _mm_add_epi32(u[ 1], K32One); |
michael@0 | 2111 | v[ 2] = _mm_add_epi32(u[ 2], K32One); |
michael@0 | 2112 | v[ 3] = _mm_add_epi32(u[ 3], K32One); |
michael@0 | 2113 | v[ 4] = _mm_add_epi32(u[ 4], K32One); |
michael@0 | 2114 | v[ 5] = _mm_add_epi32(u[ 5], K32One); |
michael@0 | 2115 | v[ 6] = _mm_add_epi32(u[ 6], K32One); |
michael@0 | 2116 | v[ 7] = _mm_add_epi32(u[ 7], K32One); |
michael@0 | 2117 | v[ 8] = _mm_add_epi32(u[ 8], K32One); |
michael@0 | 2118 | v[ 9] = _mm_add_epi32(u[ 9], K32One); |
michael@0 | 2119 | v[10] = _mm_add_epi32(u[10], K32One); |
michael@0 | 2120 | v[11] = _mm_add_epi32(u[11], K32One); |
michael@0 | 2121 | v[12] = _mm_add_epi32(u[12], K32One); |
michael@0 | 2122 | v[13] = _mm_add_epi32(u[13], K32One); |
michael@0 | 2123 | v[14] = _mm_add_epi32(u[14], K32One); |
michael@0 | 2124 | v[15] = _mm_add_epi32(u[15], K32One); |
michael@0 | 2125 | |
michael@0 | 2126 | u[ 0] = _mm_srai_epi32(v[ 0], 2); |
michael@0 | 2127 | u[ 1] = _mm_srai_epi32(v[ 1], 2); |
michael@0 | 2128 | u[ 2] = _mm_srai_epi32(v[ 2], 2); |
michael@0 | 2129 | u[ 3] = _mm_srai_epi32(v[ 3], 2); |
michael@0 | 2130 | u[ 4] = _mm_srai_epi32(v[ 4], 2); |
michael@0 | 2131 | u[ 5] = _mm_srai_epi32(v[ 5], 2); |
michael@0 | 2132 | u[ 6] = _mm_srai_epi32(v[ 6], 2); |
michael@0 | 2133 | u[ 7] = _mm_srai_epi32(v[ 7], 2); |
michael@0 | 2134 | u[ 8] = _mm_srai_epi32(v[ 8], 2); |
michael@0 | 2135 | u[ 9] = _mm_srai_epi32(v[ 9], 2); |
michael@0 | 2136 | u[10] = _mm_srai_epi32(v[10], 2); |
michael@0 | 2137 | u[11] = _mm_srai_epi32(v[11], 2); |
michael@0 | 2138 | u[12] = _mm_srai_epi32(v[12], 2); |
michael@0 | 2139 | u[13] = _mm_srai_epi32(v[13], 2); |
michael@0 | 2140 | u[14] = _mm_srai_epi32(v[14], 2); |
michael@0 | 2141 | u[15] = _mm_srai_epi32(v[15], 2); |
michael@0 | 2142 | |
michael@0 | 2143 | out[ 2] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2144 | out[18] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2145 | out[10] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2146 | out[26] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2147 | out[ 6] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 2148 | out[22] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 2149 | out[14] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 2150 | out[30] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 2151 | } |
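// Add/subtract the remaining lstep2[32..63] terms with the lstep3 values
// rotated above to form lstep1[32..63], the inputs to stage 8.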
michael@0 | 2152 | { |
michael@0 | 2153 | lstep1[32] = _mm_add_epi32(lstep3[34], lstep2[32]); |
michael@0 | 2154 | lstep1[33] = _mm_add_epi32(lstep3[35], lstep2[33]); |
michael@0 | 2155 | lstep1[34] = _mm_sub_epi32(lstep2[32], lstep3[34]); |
michael@0 | 2156 | lstep1[35] = _mm_sub_epi32(lstep2[33], lstep3[35]); |
michael@0 | 2157 | lstep1[36] = _mm_sub_epi32(lstep2[38], lstep3[36]); |
michael@0 | 2158 | lstep1[37] = _mm_sub_epi32(lstep2[39], lstep3[37]); |
michael@0 | 2159 | lstep1[38] = _mm_add_epi32(lstep3[36], lstep2[38]); |
michael@0 | 2160 | lstep1[39] = _mm_add_epi32(lstep3[37], lstep2[39]); |
michael@0 | 2161 | lstep1[40] = _mm_add_epi32(lstep3[42], lstep2[40]); |
michael@0 | 2162 | lstep1[41] = _mm_add_epi32(lstep3[43], lstep2[41]); |
michael@0 | 2163 | lstep1[42] = _mm_sub_epi32(lstep2[40], lstep3[42]); |
michael@0 | 2164 | lstep1[43] = _mm_sub_epi32(lstep2[41], lstep3[43]); |
michael@0 | 2165 | lstep1[44] = _mm_sub_epi32(lstep2[46], lstep3[44]); |
michael@0 | 2166 | lstep1[45] = _mm_sub_epi32(lstep2[47], lstep3[45]); |
michael@0 | 2167 | lstep1[46] = _mm_add_epi32(lstep3[44], lstep2[46]); |
michael@0 | 2168 | lstep1[47] = _mm_add_epi32(lstep3[45], lstep2[47]); |
michael@0 | 2169 | lstep1[48] = _mm_add_epi32(lstep3[50], lstep2[48]); |
michael@0 | 2170 | lstep1[49] = _mm_add_epi32(lstep3[51], lstep2[49]); |
michael@0 | 2171 | lstep1[50] = _mm_sub_epi32(lstep2[48], lstep3[50]); |
michael@0 | 2172 | lstep1[51] = _mm_sub_epi32(lstep2[49], lstep3[51]); |
michael@0 | 2173 | lstep1[52] = _mm_sub_epi32(lstep2[54], lstep3[52]); |
michael@0 | 2174 | lstep1[53] = _mm_sub_epi32(lstep2[55], lstep3[53]); |
michael@0 | 2175 | lstep1[54] = _mm_add_epi32(lstep3[52], lstep2[54]); |
michael@0 | 2176 | lstep1[55] = _mm_add_epi32(lstep3[53], lstep2[55]); |
michael@0 | 2177 | lstep1[56] = _mm_add_epi32(lstep3[58], lstep2[56]); |
michael@0 | 2178 | lstep1[57] = _mm_add_epi32(lstep3[59], lstep2[57]); |
michael@0 | 2179 | lstep1[58] = _mm_sub_epi32(lstep2[56], lstep3[58]); |
michael@0 | 2180 | lstep1[59] = _mm_sub_epi32(lstep2[57], lstep3[59]); |
michael@0 | 2181 | lstep1[60] = _mm_sub_epi32(lstep2[62], lstep3[60]); |
michael@0 | 2182 | lstep1[61] = _mm_sub_epi32(lstep2[63], lstep3[61]); |
michael@0 | 2183 | lstep1[62] = _mm_add_epi32(lstep3[60], lstep2[62]); |
michael@0 | 2184 | lstep1[63] = _mm_add_epi32(lstep3[61], lstep2[63]); |
michael@0 | 2185 | } |
michael@0 | 2186 | // stage 8 |
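// Final rotations: lstep1[32..39]/[56..63] produce out[1], out[17],
// out[9], out[25], out[7], out[23], out[15] and out[31]; lstep1[40..55]
// produce out[5], out[21], out[13], out[29], out[3], out[19], out[11]
// and out[27].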
michael@0 | 2187 | { |
michael@0 | 2188 | const __m128i k32_p31_p01 = pair_set_epi32(cospi_31_64, cospi_1_64); |
michael@0 | 2189 | const __m128i k32_p15_p17 = pair_set_epi32(cospi_15_64, cospi_17_64); |
michael@0 | 2190 | const __m128i k32_p23_p09 = pair_set_epi32(cospi_23_64, cospi_9_64); |
michael@0 | 2191 | const __m128i k32_p07_p25 = pair_set_epi32(cospi_7_64, cospi_25_64); |
michael@0 | 2192 | const __m128i k32_m25_p07 = pair_set_epi32(-cospi_25_64, cospi_7_64); |
michael@0 | 2193 | const __m128i k32_m09_p23 = pair_set_epi32(-cospi_9_64, cospi_23_64); |
michael@0 | 2194 | const __m128i k32_m17_p15 = pair_set_epi32(-cospi_17_64, cospi_15_64); |
michael@0 | 2195 | const __m128i k32_m01_p31 = pair_set_epi32(-cospi_1_64, cospi_31_64); |
michael@0 | 2196 | |
michael@0 | 2197 | u[ 0] = _mm_unpacklo_epi32(lstep1[32], lstep1[62]); |
michael@0 | 2198 | u[ 1] = _mm_unpackhi_epi32(lstep1[32], lstep1[62]); |
michael@0 | 2199 | u[ 2] = _mm_unpacklo_epi32(lstep1[33], lstep1[63]); |
michael@0 | 2200 | u[ 3] = _mm_unpackhi_epi32(lstep1[33], lstep1[63]); |
michael@0 | 2201 | u[ 4] = _mm_unpacklo_epi32(lstep1[34], lstep1[60]); |
michael@0 | 2202 | u[ 5] = _mm_unpackhi_epi32(lstep1[34], lstep1[60]); |
michael@0 | 2203 | u[ 6] = _mm_unpacklo_epi32(lstep1[35], lstep1[61]); |
michael@0 | 2204 | u[ 7] = _mm_unpackhi_epi32(lstep1[35], lstep1[61]); |
michael@0 | 2205 | u[ 8] = _mm_unpacklo_epi32(lstep1[36], lstep1[58]); |
michael@0 | 2206 | u[ 9] = _mm_unpackhi_epi32(lstep1[36], lstep1[58]); |
michael@0 | 2207 | u[10] = _mm_unpacklo_epi32(lstep1[37], lstep1[59]); |
michael@0 | 2208 | u[11] = _mm_unpackhi_epi32(lstep1[37], lstep1[59]); |
michael@0 | 2209 | u[12] = _mm_unpacklo_epi32(lstep1[38], lstep1[56]); |
michael@0 | 2210 | u[13] = _mm_unpackhi_epi32(lstep1[38], lstep1[56]); |
michael@0 | 2211 | u[14] = _mm_unpacklo_epi32(lstep1[39], lstep1[57]); |
michael@0 | 2212 | u[15] = _mm_unpackhi_epi32(lstep1[39], lstep1[57]); |
michael@0 | 2213 | |
michael@0 | 2214 | v[ 0] = k_madd_epi32(u[ 0], k32_p31_p01); |
michael@0 | 2215 | v[ 1] = k_madd_epi32(u[ 1], k32_p31_p01); |
michael@0 | 2216 | v[ 2] = k_madd_epi32(u[ 2], k32_p31_p01); |
michael@0 | 2217 | v[ 3] = k_madd_epi32(u[ 3], k32_p31_p01); |
michael@0 | 2218 | v[ 4] = k_madd_epi32(u[ 4], k32_p15_p17); |
michael@0 | 2219 | v[ 5] = k_madd_epi32(u[ 5], k32_p15_p17); |
michael@0 | 2220 | v[ 6] = k_madd_epi32(u[ 6], k32_p15_p17); |
michael@0 | 2221 | v[ 7] = k_madd_epi32(u[ 7], k32_p15_p17); |
michael@0 | 2222 | v[ 8] = k_madd_epi32(u[ 8], k32_p23_p09); |
michael@0 | 2223 | v[ 9] = k_madd_epi32(u[ 9], k32_p23_p09); |
michael@0 | 2224 | v[10] = k_madd_epi32(u[10], k32_p23_p09); |
michael@0 | 2225 | v[11] = k_madd_epi32(u[11], k32_p23_p09); |
michael@0 | 2226 | v[12] = k_madd_epi32(u[12], k32_p07_p25); |
michael@0 | 2227 | v[13] = k_madd_epi32(u[13], k32_p07_p25); |
michael@0 | 2228 | v[14] = k_madd_epi32(u[14], k32_p07_p25); |
michael@0 | 2229 | v[15] = k_madd_epi32(u[15], k32_p07_p25); |
michael@0 | 2230 | v[16] = k_madd_epi32(u[12], k32_m25_p07); |
michael@0 | 2231 | v[17] = k_madd_epi32(u[13], k32_m25_p07); |
michael@0 | 2232 | v[18] = k_madd_epi32(u[14], k32_m25_p07); |
michael@0 | 2233 | v[19] = k_madd_epi32(u[15], k32_m25_p07); |
michael@0 | 2234 | v[20] = k_madd_epi32(u[ 8], k32_m09_p23); |
michael@0 | 2235 | v[21] = k_madd_epi32(u[ 9], k32_m09_p23); |
michael@0 | 2236 | v[22] = k_madd_epi32(u[10], k32_m09_p23); |
michael@0 | 2237 | v[23] = k_madd_epi32(u[11], k32_m09_p23); |
michael@0 | 2238 | v[24] = k_madd_epi32(u[ 4], k32_m17_p15); |
michael@0 | 2239 | v[25] = k_madd_epi32(u[ 5], k32_m17_p15); |
michael@0 | 2240 | v[26] = k_madd_epi32(u[ 6], k32_m17_p15); |
michael@0 | 2241 | v[27] = k_madd_epi32(u[ 7], k32_m17_p15); |
michael@0 | 2242 | v[28] = k_madd_epi32(u[ 0], k32_m01_p31); |
michael@0 | 2243 | v[29] = k_madd_epi32(u[ 1], k32_m01_p31); |
michael@0 | 2244 | v[30] = k_madd_epi32(u[ 2], k32_m01_p31); |
michael@0 | 2245 | v[31] = k_madd_epi32(u[ 3], k32_m01_p31); |
michael@0 | 2246 | |
michael@0 | 2247 | u[ 0] = k_packs_epi64(v[ 0], v[ 1]); |
michael@0 | 2248 | u[ 1] = k_packs_epi64(v[ 2], v[ 3]); |
michael@0 | 2249 | u[ 2] = k_packs_epi64(v[ 4], v[ 5]); |
michael@0 | 2250 | u[ 3] = k_packs_epi64(v[ 6], v[ 7]); |
michael@0 | 2251 | u[ 4] = k_packs_epi64(v[ 8], v[ 9]); |
michael@0 | 2252 | u[ 5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 2253 | u[ 6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 2254 | u[ 7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 2255 | u[ 8] = k_packs_epi64(v[16], v[17]); |
michael@0 | 2256 | u[ 9] = k_packs_epi64(v[18], v[19]); |
michael@0 | 2257 | u[10] = k_packs_epi64(v[20], v[21]); |
michael@0 | 2258 | u[11] = k_packs_epi64(v[22], v[23]); |
michael@0 | 2259 | u[12] = k_packs_epi64(v[24], v[25]); |
michael@0 | 2260 | u[13] = k_packs_epi64(v[26], v[27]); |
michael@0 | 2261 | u[14] = k_packs_epi64(v[28], v[29]); |
michael@0 | 2262 | u[15] = k_packs_epi64(v[30], v[31]); |
michael@0 | 2263 | |
michael@0 | 2264 | v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); |
michael@0 | 2265 | v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); |
michael@0 | 2266 | v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); |
michael@0 | 2267 | v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); |
michael@0 | 2268 | v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); |
michael@0 | 2269 | v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); |
michael@0 | 2270 | v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); |
michael@0 | 2271 | v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); |
michael@0 | 2272 | v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); |
michael@0 | 2273 | v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); |
michael@0 | 2274 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 2275 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 2276 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 2277 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 2278 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 2279 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 2280 | |
michael@0 | 2281 | u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); |
michael@0 | 2282 | u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); |
michael@0 | 2283 | u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); |
michael@0 | 2284 | u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); |
michael@0 | 2285 | u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); |
michael@0 | 2286 | u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); |
michael@0 | 2287 | u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); |
michael@0 | 2288 | u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); |
michael@0 | 2289 | u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); |
michael@0 | 2290 | u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); |
michael@0 | 2291 | u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 2292 | u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 2293 | u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 2294 | u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 2295 | u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 2296 | u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 2297 | |
michael@0 | 2298 | v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); |
michael@0 | 2299 | v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); |
michael@0 | 2300 | v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); |
michael@0 | 2301 | v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); |
michael@0 | 2302 | v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); |
michael@0 | 2303 | v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); |
michael@0 | 2304 | v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); |
michael@0 | 2305 | v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); |
michael@0 | 2306 | v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); |
michael@0 | 2307 | v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); |
michael@0 | 2308 | v[10] = _mm_cmplt_epi32(u[10], kZero); |
michael@0 | 2309 | v[11] = _mm_cmplt_epi32(u[11], kZero); |
michael@0 | 2310 | v[12] = _mm_cmplt_epi32(u[12], kZero); |
michael@0 | 2311 | v[13] = _mm_cmplt_epi32(u[13], kZero); |
michael@0 | 2312 | v[14] = _mm_cmplt_epi32(u[14], kZero); |
michael@0 | 2313 | v[15] = _mm_cmplt_epi32(u[15], kZero); |
michael@0 | 2314 | |
michael@0 | 2315 | u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); |
michael@0 | 2316 | u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); |
michael@0 | 2317 | u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); |
michael@0 | 2318 | u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); |
michael@0 | 2319 | u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); |
michael@0 | 2320 | u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); |
michael@0 | 2321 | u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); |
michael@0 | 2322 | u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); |
michael@0 | 2323 | u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); |
michael@0 | 2324 | u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); |
michael@0 | 2325 | u[10] = _mm_sub_epi32(u[10], v[10]); |
michael@0 | 2326 | u[11] = _mm_sub_epi32(u[11], v[11]); |
michael@0 | 2327 | u[12] = _mm_sub_epi32(u[12], v[12]); |
michael@0 | 2328 | u[13] = _mm_sub_epi32(u[13], v[13]); |
michael@0 | 2329 | u[14] = _mm_sub_epi32(u[14], v[14]); |
michael@0 | 2330 | u[15] = _mm_sub_epi32(u[15], v[15]); |
michael@0 | 2331 | |
michael@0 | 2332 | v[0] = _mm_add_epi32(u[0], K32One); |
michael@0 | 2333 | v[1] = _mm_add_epi32(u[1], K32One); |
michael@0 | 2334 | v[2] = _mm_add_epi32(u[2], K32One); |
michael@0 | 2335 | v[3] = _mm_add_epi32(u[3], K32One); |
michael@0 | 2336 | v[4] = _mm_add_epi32(u[4], K32One); |
michael@0 | 2337 | v[5] = _mm_add_epi32(u[5], K32One); |
michael@0 | 2338 | v[6] = _mm_add_epi32(u[6], K32One); |
michael@0 | 2339 | v[7] = _mm_add_epi32(u[7], K32One); |
michael@0 | 2340 | v[8] = _mm_add_epi32(u[8], K32One); |
michael@0 | 2341 | v[9] = _mm_add_epi32(u[9], K32One); |
michael@0 | 2342 | v[10] = _mm_add_epi32(u[10], K32One); |
michael@0 | 2343 | v[11] = _mm_add_epi32(u[11], K32One); |
michael@0 | 2344 | v[12] = _mm_add_epi32(u[12], K32One); |
michael@0 | 2345 | v[13] = _mm_add_epi32(u[13], K32One); |
michael@0 | 2346 | v[14] = _mm_add_epi32(u[14], K32One); |
michael@0 | 2347 | v[15] = _mm_add_epi32(u[15], K32One); |
michael@0 | 2348 | |
michael@0 | 2349 | u[0] = _mm_srai_epi32(v[0], 2); |
michael@0 | 2350 | u[1] = _mm_srai_epi32(v[1], 2); |
michael@0 | 2351 | u[2] = _mm_srai_epi32(v[2], 2); |
michael@0 | 2352 | u[3] = _mm_srai_epi32(v[3], 2); |
michael@0 | 2353 | u[4] = _mm_srai_epi32(v[4], 2); |
michael@0 | 2354 | u[5] = _mm_srai_epi32(v[5], 2); |
michael@0 | 2355 | u[6] = _mm_srai_epi32(v[6], 2); |
michael@0 | 2356 | u[7] = _mm_srai_epi32(v[7], 2); |
michael@0 | 2357 | u[8] = _mm_srai_epi32(v[8], 2); |
michael@0 | 2358 | u[9] = _mm_srai_epi32(v[9], 2); |
michael@0 | 2359 | u[10] = _mm_srai_epi32(v[10], 2); |
michael@0 | 2360 | u[11] = _mm_srai_epi32(v[11], 2); |
michael@0 | 2361 | u[12] = _mm_srai_epi32(v[12], 2); |
michael@0 | 2362 | u[13] = _mm_srai_epi32(v[13], 2); |
michael@0 | 2363 | u[14] = _mm_srai_epi32(v[14], 2); |
michael@0 | 2364 | u[15] = _mm_srai_epi32(v[15], 2); |
michael@0 | 2365 | |
michael@0 | 2366 | out[ 1] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2367 | out[17] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2368 | out[ 9] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2369 | out[25] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2370 | out[ 7] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 2371 | out[23] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 2372 | out[15] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 2373 | out[31] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 2374 | } |
michael@0 | 2375 | { |
michael@0 | 2376 | const __m128i k32_p27_p05 = pair_set_epi32(cospi_27_64, cospi_5_64); |
michael@0 | 2377 | const __m128i k32_p11_p21 = pair_set_epi32(cospi_11_64, cospi_21_64); |
michael@0 | 2378 | const __m128i k32_p19_p13 = pair_set_epi32(cospi_19_64, cospi_13_64); |
michael@0 | 2379 | const __m128i k32_p03_p29 = pair_set_epi32(cospi_3_64, cospi_29_64); |
michael@0 | 2380 | const __m128i k32_m29_p03 = pair_set_epi32(-cospi_29_64, cospi_3_64); |
michael@0 | 2381 | const __m128i k32_m13_p19 = pair_set_epi32(-cospi_13_64, cospi_19_64); |
michael@0 | 2382 | const __m128i k32_m21_p11 = pair_set_epi32(-cospi_21_64, cospi_11_64); |
michael@0 | 2383 | const __m128i k32_m05_p27 = pair_set_epi32(-cospi_5_64, cospi_27_64); |
michael@0 | 2384 | |
michael@0 | 2385 | u[ 0] = _mm_unpacklo_epi32(lstep1[40], lstep1[54]); |
michael@0 | 2386 | u[ 1] = _mm_unpackhi_epi32(lstep1[40], lstep1[54]); |
michael@0 | 2387 | u[ 2] = _mm_unpacklo_epi32(lstep1[41], lstep1[55]); |
michael@0 | 2388 | u[ 3] = _mm_unpackhi_epi32(lstep1[41], lstep1[55]); |
michael@0 | 2389 | u[ 4] = _mm_unpacklo_epi32(lstep1[42], lstep1[52]); |
michael@0 | 2390 | u[ 5] = _mm_unpackhi_epi32(lstep1[42], lstep1[52]); |
michael@0 | 2391 | u[ 6] = _mm_unpacklo_epi32(lstep1[43], lstep1[53]); |
michael@0 | 2392 | u[ 7] = _mm_unpackhi_epi32(lstep1[43], lstep1[53]); |
michael@0 | 2393 | u[ 8] = _mm_unpacklo_epi32(lstep1[44], lstep1[50]); |
michael@0 | 2394 | u[ 9] = _mm_unpackhi_epi32(lstep1[44], lstep1[50]); |
michael@0 | 2395 | u[10] = _mm_unpacklo_epi32(lstep1[45], lstep1[51]); |
michael@0 | 2396 | u[11] = _mm_unpackhi_epi32(lstep1[45], lstep1[51]); |
michael@0 | 2397 | u[12] = _mm_unpacklo_epi32(lstep1[46], lstep1[48]); |
michael@0 | 2398 | u[13] = _mm_unpackhi_epi32(lstep1[46], lstep1[48]); |
michael@0 | 2399 | u[14] = _mm_unpacklo_epi32(lstep1[47], lstep1[49]); |
michael@0 | 2400 | u[15] = _mm_unpackhi_epi32(lstep1[47], lstep1[49]); |
michael@0 | 2401 | |
michael@0 | 2402 | v[ 0] = k_madd_epi32(u[ 0], k32_p27_p05); |
michael@0 | 2403 | v[ 1] = k_madd_epi32(u[ 1], k32_p27_p05); |
michael@0 | 2404 | v[ 2] = k_madd_epi32(u[ 2], k32_p27_p05); |
michael@0 | 2405 | v[ 3] = k_madd_epi32(u[ 3], k32_p27_p05); |
michael@0 | 2406 | v[ 4] = k_madd_epi32(u[ 4], k32_p11_p21); |
michael@0 | 2407 | v[ 5] = k_madd_epi32(u[ 5], k32_p11_p21); |
michael@0 | 2408 | v[ 6] = k_madd_epi32(u[ 6], k32_p11_p21); |
michael@0 | 2409 | v[ 7] = k_madd_epi32(u[ 7], k32_p11_p21); |
michael@0 | 2410 | v[ 8] = k_madd_epi32(u[ 8], k32_p19_p13); |
michael@0 | 2411 | v[ 9] = k_madd_epi32(u[ 9], k32_p19_p13); |
michael@0 | 2412 | v[10] = k_madd_epi32(u[10], k32_p19_p13); |
michael@0 | 2413 | v[11] = k_madd_epi32(u[11], k32_p19_p13); |
michael@0 | 2414 | v[12] = k_madd_epi32(u[12], k32_p03_p29); |
michael@0 | 2415 | v[13] = k_madd_epi32(u[13], k32_p03_p29); |
michael@0 | 2416 | v[14] = k_madd_epi32(u[14], k32_p03_p29); |
michael@0 | 2417 | v[15] = k_madd_epi32(u[15], k32_p03_p29); |
michael@0 | 2418 | v[16] = k_madd_epi32(u[12], k32_m29_p03); |
michael@0 | 2419 | v[17] = k_madd_epi32(u[13], k32_m29_p03); |
michael@0 | 2420 | v[18] = k_madd_epi32(u[14], k32_m29_p03); |
michael@0 | 2421 | v[19] = k_madd_epi32(u[15], k32_m29_p03); |
michael@0 | 2422 | v[20] = k_madd_epi32(u[ 8], k32_m13_p19); |
michael@0 | 2423 | v[21] = k_madd_epi32(u[ 9], k32_m13_p19); |
michael@0 | 2424 | v[22] = k_madd_epi32(u[10], k32_m13_p19); |
michael@0 | 2425 | v[23] = k_madd_epi32(u[11], k32_m13_p19); |
michael@0 | 2426 | v[24] = k_madd_epi32(u[ 4], k32_m21_p11); |
michael@0 | 2427 | v[25] = k_madd_epi32(u[ 5], k32_m21_p11); |
michael@0 | 2428 | v[26] = k_madd_epi32(u[ 6], k32_m21_p11); |
michael@0 | 2429 | v[27] = k_madd_epi32(u[ 7], k32_m21_p11); |
michael@0 | 2430 | v[28] = k_madd_epi32(u[ 0], k32_m05_p27); |
michael@0 | 2431 | v[29] = k_madd_epi32(u[ 1], k32_m05_p27); |
michael@0 | 2432 | v[30] = k_madd_epi32(u[ 2], k32_m05_p27); |
michael@0 | 2433 | v[31] = k_madd_epi32(u[ 3], k32_m05_p27); |
michael@0 | 2434 | |
michael@0 | 2435 | u[ 0] = k_packs_epi64(v[ 0], v[ 1]); |
michael@0 | 2436 | u[ 1] = k_packs_epi64(v[ 2], v[ 3]); |
michael@0 | 2437 | u[ 2] = k_packs_epi64(v[ 4], v[ 5]); |
michael@0 | 2438 | u[ 3] = k_packs_epi64(v[ 6], v[ 7]); |
michael@0 | 2439 | u[ 4] = k_packs_epi64(v[ 8], v[ 9]); |
michael@0 | 2440 | u[ 5] = k_packs_epi64(v[10], v[11]); |
michael@0 | 2441 | u[ 6] = k_packs_epi64(v[12], v[13]); |
michael@0 | 2442 | u[ 7] = k_packs_epi64(v[14], v[15]); |
michael@0 | 2443 | u[ 8] = k_packs_epi64(v[16], v[17]); |
michael@0 | 2444 | u[ 9] = k_packs_epi64(v[18], v[19]); |
michael@0 | 2445 | u[10] = k_packs_epi64(v[20], v[21]); |
michael@0 | 2446 | u[11] = k_packs_epi64(v[22], v[23]); |
michael@0 | 2447 | u[12] = k_packs_epi64(v[24], v[25]); |
michael@0 | 2448 | u[13] = k_packs_epi64(v[26], v[27]); |
michael@0 | 2449 | u[14] = k_packs_epi64(v[28], v[29]); |
michael@0 | 2450 | u[15] = k_packs_epi64(v[30], v[31]); |
michael@0 | 2451 | |
michael@0 | 2452 | v[ 0] = _mm_add_epi32(u[ 0], k__DCT_CONST_ROUNDING); |
michael@0 | 2453 | v[ 1] = _mm_add_epi32(u[ 1], k__DCT_CONST_ROUNDING); |
michael@0 | 2454 | v[ 2] = _mm_add_epi32(u[ 2], k__DCT_CONST_ROUNDING); |
michael@0 | 2455 | v[ 3] = _mm_add_epi32(u[ 3], k__DCT_CONST_ROUNDING); |
michael@0 | 2456 | v[ 4] = _mm_add_epi32(u[ 4], k__DCT_CONST_ROUNDING); |
michael@0 | 2457 | v[ 5] = _mm_add_epi32(u[ 5], k__DCT_CONST_ROUNDING); |
michael@0 | 2458 | v[ 6] = _mm_add_epi32(u[ 6], k__DCT_CONST_ROUNDING); |
michael@0 | 2459 | v[ 7] = _mm_add_epi32(u[ 7], k__DCT_CONST_ROUNDING); |
michael@0 | 2460 | v[ 8] = _mm_add_epi32(u[ 8], k__DCT_CONST_ROUNDING); |
michael@0 | 2461 | v[ 9] = _mm_add_epi32(u[ 9], k__DCT_CONST_ROUNDING); |
michael@0 | 2462 | v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); |
michael@0 | 2463 | v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); |
michael@0 | 2464 | v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); |
michael@0 | 2465 | v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); |
michael@0 | 2466 | v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); |
michael@0 | 2467 | v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); |
michael@0 | 2468 | |
michael@0 | 2469 | u[ 0] = _mm_srai_epi32(v[ 0], DCT_CONST_BITS); |
michael@0 | 2470 | u[ 1] = _mm_srai_epi32(v[ 1], DCT_CONST_BITS); |
michael@0 | 2471 | u[ 2] = _mm_srai_epi32(v[ 2], DCT_CONST_BITS); |
michael@0 | 2472 | u[ 3] = _mm_srai_epi32(v[ 3], DCT_CONST_BITS); |
michael@0 | 2473 | u[ 4] = _mm_srai_epi32(v[ 4], DCT_CONST_BITS); |
michael@0 | 2474 | u[ 5] = _mm_srai_epi32(v[ 5], DCT_CONST_BITS); |
michael@0 | 2475 | u[ 6] = _mm_srai_epi32(v[ 6], DCT_CONST_BITS); |
michael@0 | 2476 | u[ 7] = _mm_srai_epi32(v[ 7], DCT_CONST_BITS); |
michael@0 | 2477 | u[ 8] = _mm_srai_epi32(v[ 8], DCT_CONST_BITS); |
michael@0 | 2478 | u[ 9] = _mm_srai_epi32(v[ 9], DCT_CONST_BITS); |
michael@0 | 2479 | u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); |
michael@0 | 2480 | u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); |
michael@0 | 2481 | u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); |
michael@0 | 2482 | u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); |
michael@0 | 2483 | u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); |
michael@0 | 2484 | u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); |
michael@0 | 2485 | |
michael@0 | 2486 | v[ 0] = _mm_cmplt_epi32(u[ 0], kZero); |
michael@0 | 2487 | v[ 1] = _mm_cmplt_epi32(u[ 1], kZero); |
michael@0 | 2488 | v[ 2] = _mm_cmplt_epi32(u[ 2], kZero); |
michael@0 | 2489 | v[ 3] = _mm_cmplt_epi32(u[ 3], kZero); |
michael@0 | 2490 | v[ 4] = _mm_cmplt_epi32(u[ 4], kZero); |
michael@0 | 2491 | v[ 5] = _mm_cmplt_epi32(u[ 5], kZero); |
michael@0 | 2492 | v[ 6] = _mm_cmplt_epi32(u[ 6], kZero); |
michael@0 | 2493 | v[ 7] = _mm_cmplt_epi32(u[ 7], kZero); |
michael@0 | 2494 | v[ 8] = _mm_cmplt_epi32(u[ 8], kZero); |
michael@0 | 2495 | v[ 9] = _mm_cmplt_epi32(u[ 9], kZero); |
michael@0 | 2496 | v[10] = _mm_cmplt_epi32(u[10], kZero); |
michael@0 | 2497 | v[11] = _mm_cmplt_epi32(u[11], kZero); |
michael@0 | 2498 | v[12] = _mm_cmplt_epi32(u[12], kZero); |
michael@0 | 2499 | v[13] = _mm_cmplt_epi32(u[13], kZero); |
michael@0 | 2500 | v[14] = _mm_cmplt_epi32(u[14], kZero); |
michael@0 | 2501 | v[15] = _mm_cmplt_epi32(u[15], kZero); |
michael@0 | 2502 | |
michael@0 | 2503 | u[ 0] = _mm_sub_epi32(u[ 0], v[ 0]); |
michael@0 | 2504 | u[ 1] = _mm_sub_epi32(u[ 1], v[ 1]); |
michael@0 | 2505 | u[ 2] = _mm_sub_epi32(u[ 2], v[ 2]); |
michael@0 | 2506 | u[ 3] = _mm_sub_epi32(u[ 3], v[ 3]); |
michael@0 | 2507 | u[ 4] = _mm_sub_epi32(u[ 4], v[ 4]); |
michael@0 | 2508 | u[ 5] = _mm_sub_epi32(u[ 5], v[ 5]); |
michael@0 | 2509 | u[ 6] = _mm_sub_epi32(u[ 6], v[ 6]); |
michael@0 | 2510 | u[ 7] = _mm_sub_epi32(u[ 7], v[ 7]); |
michael@0 | 2511 | u[ 8] = _mm_sub_epi32(u[ 8], v[ 8]); |
michael@0 | 2512 | u[ 9] = _mm_sub_epi32(u[ 9], v[ 9]); |
michael@0 | 2513 | u[10] = _mm_sub_epi32(u[10], v[10]); |
michael@0 | 2514 | u[11] = _mm_sub_epi32(u[11], v[11]); |
michael@0 | 2515 | u[12] = _mm_sub_epi32(u[12], v[12]); |
michael@0 | 2516 | u[13] = _mm_sub_epi32(u[13], v[13]); |
michael@0 | 2517 | u[14] = _mm_sub_epi32(u[14], v[14]); |
michael@0 | 2518 | u[15] = _mm_sub_epi32(u[15], v[15]); |
michael@0 | 2519 | |
michael@0 | 2520 | v[0] = _mm_add_epi32(u[0], K32One); |
michael@0 | 2521 | v[1] = _mm_add_epi32(u[1], K32One); |
michael@0 | 2522 | v[2] = _mm_add_epi32(u[2], K32One); |
michael@0 | 2523 | v[3] = _mm_add_epi32(u[3], K32One); |
michael@0 | 2524 | v[4] = _mm_add_epi32(u[4], K32One); |
michael@0 | 2525 | v[5] = _mm_add_epi32(u[5], K32One); |
michael@0 | 2526 | v[6] = _mm_add_epi32(u[6], K32One); |
michael@0 | 2527 | v[7] = _mm_add_epi32(u[7], K32One); |
michael@0 | 2528 | v[8] = _mm_add_epi32(u[8], K32One); |
michael@0 | 2529 | v[9] = _mm_add_epi32(u[9], K32One); |
michael@0 | 2530 | v[10] = _mm_add_epi32(u[10], K32One); |
michael@0 | 2531 | v[11] = _mm_add_epi32(u[11], K32One); |
michael@0 | 2532 | v[12] = _mm_add_epi32(u[12], K32One); |
michael@0 | 2533 | v[13] = _mm_add_epi32(u[13], K32One); |
michael@0 | 2534 | v[14] = _mm_add_epi32(u[14], K32One); |
michael@0 | 2535 | v[15] = _mm_add_epi32(u[15], K32One); |
michael@0 | 2536 | |
michael@0 | 2537 | u[0] = _mm_srai_epi32(v[0], 2); |
michael@0 | 2538 | u[1] = _mm_srai_epi32(v[1], 2); |
michael@0 | 2539 | u[2] = _mm_srai_epi32(v[2], 2); |
michael@0 | 2540 | u[3] = _mm_srai_epi32(v[3], 2); |
michael@0 | 2541 | u[4] = _mm_srai_epi32(v[4], 2); |
michael@0 | 2542 | u[5] = _mm_srai_epi32(v[5], 2); |
michael@0 | 2543 | u[6] = _mm_srai_epi32(v[6], 2); |
michael@0 | 2544 | u[7] = _mm_srai_epi32(v[7], 2); |
michael@0 | 2545 | u[8] = _mm_srai_epi32(v[8], 2); |
michael@0 | 2546 | u[9] = _mm_srai_epi32(v[9], 2); |
michael@0 | 2547 | u[10] = _mm_srai_epi32(v[10], 2); |
michael@0 | 2548 | u[11] = _mm_srai_epi32(v[11], 2); |
michael@0 | 2549 | u[12] = _mm_srai_epi32(v[12], 2); |
michael@0 | 2550 | u[13] = _mm_srai_epi32(v[13], 2); |
michael@0 | 2551 | u[14] = _mm_srai_epi32(v[14], 2); |
michael@0 | 2552 | u[15] = _mm_srai_epi32(v[15], 2); |
michael@0 | 2553 | |
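// Pack the rounded 32-bit results back to 16 bits with signed saturation and
// place them in their final coefficient rows for this stage.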
michael@0 | 2554 | out[ 5] = _mm_packs_epi32(u[0], u[1]); |
michael@0 | 2555 | out[21] = _mm_packs_epi32(u[2], u[3]); |
michael@0 | 2556 | out[13] = _mm_packs_epi32(u[4], u[5]); |
michael@0 | 2557 | out[29] = _mm_packs_epi32(u[6], u[7]); |
michael@0 | 2558 | out[ 3] = _mm_packs_epi32(u[8], u[9]); |
michael@0 | 2559 | out[19] = _mm_packs_epi32(u[10], u[11]); |
michael@0 | 2560 | out[11] = _mm_packs_epi32(u[12], u[13]); |
michael@0 | 2561 | out[27] = _mm_packs_epi32(u[14], u[15]); |
michael@0 | 2562 | } |
michael@0 | 2563 | } |
michael@0 | 2564 | #endif |
michael@0 | 2565 | // Transpose the results; do it as four 8x8 transposes. |
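// Pass 0 writes the transposed rows into the intermediate buffer so that
// pass 1 can process them as columns; pass 1 writes directly to the caller's
// output buffer.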
michael@0 | 2566 | { |
michael@0 | 2567 | int transpose_block; |
michael@0 | 2568 | int16_t *output; |
michael@0 | 2569 | if (0 == pass) { |
michael@0 | 2570 | output = &intermediate[column_start * 32]; |
michael@0 | 2571 | } else { |
michael@0 | 2572 | output = &output_org[column_start * 32]; |
michael@0 | 2573 | } |
michael@0 | 2574 | for (transpose_block = 0; transpose_block < 4; ++transpose_block) { |
michael@0 | 2575 | __m128i *this_out = &out[8 * transpose_block]; |
michael@0 | 2576 | // 00 01 02 03 04 05 06 07 |
michael@0 | 2577 | // 10 11 12 13 14 15 16 17 |
michael@0 | 2578 | // 20 21 22 23 24 25 26 27 |
michael@0 | 2579 | // 30 31 32 33 34 35 36 37 |
michael@0 | 2580 | // 40 41 42 43 44 45 46 47 |
michael@0 | 2581 | // 50 51 52 53 54 55 56 57 |
michael@0 | 2582 | // 60 61 62 63 64 65 66 67 |
michael@0 | 2583 | // 70 71 72 73 74 75 76 77 |
michael@0 | 2584 | const __m128i tr0_0 = _mm_unpacklo_epi16(this_out[0], this_out[1]); |
michael@0 | 2585 | const __m128i tr0_1 = _mm_unpacklo_epi16(this_out[2], this_out[3]); |
michael@0 | 2586 | const __m128i tr0_2 = _mm_unpackhi_epi16(this_out[0], this_out[1]); |
michael@0 | 2587 | const __m128i tr0_3 = _mm_unpackhi_epi16(this_out[2], this_out[3]); |
michael@0 | 2588 | const __m128i tr0_4 = _mm_unpacklo_epi16(this_out[4], this_out[5]); |
michael@0 | 2589 | const __m128i tr0_5 = _mm_unpacklo_epi16(this_out[6], this_out[7]); |
michael@0 | 2590 | const __m128i tr0_6 = _mm_unpackhi_epi16(this_out[4], this_out[5]); |
michael@0 | 2591 | const __m128i tr0_7 = _mm_unpackhi_epi16(this_out[6], this_out[7]); |
michael@0 | 2592 | // 00 10 01 11 02 12 03 13 |
michael@0 | 2593 | // 20 30 21 31 22 32 23 33 |
michael@0 | 2594 | // 04 14 05 15 06 16 07 17 |
michael@0 | 2595 | // 24 34 25 35 26 36 27 37 |
michael@0 | 2596 | // 40 50 41 51 42 52 43 53 |
michael@0 | 2597 | // 60 70 61 71 62 72 63 73 |
michael@0 | 2598 | // 44 54 45 55 46 56 47 57 |
michael@0 | 2599 | // 64 74 65 75 66 76 67 77 |
michael@0 | 2600 | const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); |
michael@0 | 2601 | const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); |
michael@0 | 2602 | const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); |
michael@0 | 2603 | const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); |
michael@0 | 2604 | const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); |
michael@0 | 2605 | const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); |
michael@0 | 2606 | const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); |
michael@0 | 2607 | const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); |
michael@0 | 2608 | // 00 10 20 30 01 11 21 31 |
michael@0 | 2609 | // 40 50 60 70 41 51 61 71 |
michael@0 | 2610 | // 02 12 22 32 03 13 23 33 |
michael@0 | 2611 | // 42 52 62 72 43 53 63 73 |
michael@0 | 2612 | // 04 14 24 34 05 15 25 35 |
michael@0 | 2613 | // 44 54 64 74 45 55 65 75 |
michael@0 | 2614 | // 06 16 26 36 07 17 27 37 |
michael@0 | 2615 | // 46 56 66 76 47 57 67 77 |
michael@0 | 2616 | __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); |
michael@0 | 2617 | __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); |
michael@0 | 2618 | __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); |
michael@0 | 2619 | __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); |
michael@0 | 2620 | __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); |
michael@0 | 2621 | __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); |
michael@0 | 2622 | __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); |
michael@0 | 2623 | __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); |
michael@0 | 2624 | // 00 10 20 30 40 50 60 70 |
michael@0 | 2625 | // 01 11 21 31 41 51 61 71 |
michael@0 | 2626 | // 02 12 22 32 42 52 62 72 |
michael@0 | 2627 | // 03 13 23 33 43 53 63 73 |
michael@0 | 2628 | // 04 14 24 34 44 54 64 74 |
michael@0 | 2629 | // 05 15 25 35 45 55 65 75 |
michael@0 | 2630 | // 06 16 26 36 46 56 66 76 |
michael@0 | 2631 | // 07 17 27 37 47 57 67 77 |
michael@0 | 2632 | if (0 == pass) { |
michael@0 | 2633 | // output[j] = (output[j] + 1 + (output[j] > 0)) >> 2; |
michael@0 | 2634 | // TODO(cd): see quality impact of only doing |
michael@0 | 2635 | // output[j] = (output[j] + 1) >> 2; |
michael@0 | 2636 | // which would remove the code between here ... |
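// _mm_cmpgt_epi16 yields all-ones (-1) in lanes that are strictly positive,
// so the subtraction below adds 1 to positive lanes; with the +1 (kOne) and
// the arithmetic shift, each lane computes (x + 1 + (x > 0)) >> 2.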
michael@0 | 2637 | __m128i tr2_0_0 = _mm_cmpgt_epi16(tr2_0, kZero); |
michael@0 | 2638 | __m128i tr2_1_0 = _mm_cmpgt_epi16(tr2_1, kZero); |
michael@0 | 2639 | __m128i tr2_2_0 = _mm_cmpgt_epi16(tr2_2, kZero); |
michael@0 | 2640 | __m128i tr2_3_0 = _mm_cmpgt_epi16(tr2_3, kZero); |
michael@0 | 2641 | __m128i tr2_4_0 = _mm_cmpgt_epi16(tr2_4, kZero); |
michael@0 | 2642 | __m128i tr2_5_0 = _mm_cmpgt_epi16(tr2_5, kZero); |
michael@0 | 2643 | __m128i tr2_6_0 = _mm_cmpgt_epi16(tr2_6, kZero); |
michael@0 | 2644 | __m128i tr2_7_0 = _mm_cmpgt_epi16(tr2_7, kZero); |
michael@0 | 2645 | tr2_0 = _mm_sub_epi16(tr2_0, tr2_0_0); |
michael@0 | 2646 | tr2_1 = _mm_sub_epi16(tr2_1, tr2_1_0); |
michael@0 | 2647 | tr2_2 = _mm_sub_epi16(tr2_2, tr2_2_0); |
michael@0 | 2648 | tr2_3 = _mm_sub_epi16(tr2_3, tr2_3_0); |
michael@0 | 2649 | tr2_4 = _mm_sub_epi16(tr2_4, tr2_4_0); |
michael@0 | 2650 | tr2_5 = _mm_sub_epi16(tr2_5, tr2_5_0); |
michael@0 | 2651 | tr2_6 = _mm_sub_epi16(tr2_6, tr2_6_0); |
michael@0 | 2652 | tr2_7 = _mm_sub_epi16(tr2_7, tr2_7_0); |
michael@0 | 2653 | // ... and here. |
michael@0 | 2654 | // PS: also change code in vp9/encoder/vp9_dct.c |
michael@0 | 2655 | tr2_0 = _mm_add_epi16(tr2_0, kOne); |
michael@0 | 2656 | tr2_1 = _mm_add_epi16(tr2_1, kOne); |
michael@0 | 2657 | tr2_2 = _mm_add_epi16(tr2_2, kOne); |
michael@0 | 2658 | tr2_3 = _mm_add_epi16(tr2_3, kOne); |
michael@0 | 2659 | tr2_4 = _mm_add_epi16(tr2_4, kOne); |
michael@0 | 2660 | tr2_5 = _mm_add_epi16(tr2_5, kOne); |
michael@0 | 2661 | tr2_6 = _mm_add_epi16(tr2_6, kOne); |
michael@0 | 2662 | tr2_7 = _mm_add_epi16(tr2_7, kOne); |
michael@0 | 2663 | tr2_0 = _mm_srai_epi16(tr2_0, 2); |
michael@0 | 2664 | tr2_1 = _mm_srai_epi16(tr2_1, 2); |
michael@0 | 2665 | tr2_2 = _mm_srai_epi16(tr2_2, 2); |
michael@0 | 2666 | tr2_3 = _mm_srai_epi16(tr2_3, 2); |
michael@0 | 2667 | tr2_4 = _mm_srai_epi16(tr2_4, 2); |
michael@0 | 2668 | tr2_5 = _mm_srai_epi16(tr2_5, 2); |
michael@0 | 2669 | tr2_6 = _mm_srai_epi16(tr2_6, 2); |
michael@0 | 2670 | tr2_7 = _mm_srai_epi16(tr2_7, 2); |
michael@0 | 2671 | } |
michael@0 | 2672 | // Note: even though all these stores are aligned, using the aligned |
michael@0 | 2673 | // intrinsic makes the code slightly slower. |
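// Each store writes one transposed row of eight 16-bit coefficients; rows in
// the destination are 32 coefficients apart.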
michael@0 | 2674 | _mm_storeu_si128((__m128i *)(output + 0 * 32), tr2_0); |
michael@0 | 2675 | _mm_storeu_si128((__m128i *)(output + 1 * 32), tr2_1); |
michael@0 | 2676 | _mm_storeu_si128((__m128i *)(output + 2 * 32), tr2_2); |
michael@0 | 2677 | _mm_storeu_si128((__m128i *)(output + 3 * 32), tr2_3); |
michael@0 | 2678 | _mm_storeu_si128((__m128i *)(output + 4 * 32), tr2_4); |
michael@0 | 2679 | _mm_storeu_si128((__m128i *)(output + 5 * 32), tr2_5); |
michael@0 | 2680 | _mm_storeu_si128((__m128i *)(output + 6 * 32), tr2_6); |
michael@0 | 2681 | _mm_storeu_si128((__m128i *)(output + 7 * 32), tr2_7); |
michael@0 | 2682 | // Process next 8x8 |
michael@0 | 2683 | output += 8; |
michael@0 | 2684 | } |
michael@0 | 2685 | } |
michael@0 | 2686 | } |
michael@0 | 2687 | } |
michael@0 | 2688 | } // NOLINT |