/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <emmintrin.h>  // SSE2
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"

#define RECON_AND_STORE4X4(dest, in_x) \
  { \
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    *(int *)dest = _mm_cvtsi128_si32(d0); \
    dest += stride; \
  }

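// RECON_AND_STORE4X4 adds one row of inverse-transformed residuals (in_x) to
// four predicted pixels: the pixels are loaded as bytes, widened to 16 bits,
// summed with the residual, saturated back to 8 bits, stored, and dest then
// advances by one stride.  The DCT_CONST_BITS / DCT_CONST_ROUNDING constants
// used throughout come from vp9_idct.h (14-bit fixed-point cosine constants,
// rounded with 1 << (DCT_CONST_BITS - 1)).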
void vp9_idct4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);
  const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
                                     (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i input0, input1, input2, input3;

  // Rows
  input0 = _mm_load_si128((const __m128i *)input);
  input2 = _mm_load_si128((const __m128i *)(input + 8));

  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_shufflelo_epi16(input0, 0xd8);
  input0 = _mm_shufflehi_epi16(input0, 0xd8);
  input2 = _mm_shufflelo_epi16(input2, 0xd8);
  input2 = _mm_shufflehi_epi16(input2, 0xd8);

  input1 = _mm_unpackhi_epi32(input0, input0);
  input0 = _mm_unpacklo_epi32(input0, input0);
  input3 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpacklo_epi32(input2, input2);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input1);
  input1 = _mm_packs_epi32(input2, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3, so that we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Columns
  // Construct i3, i1, i3, i1, i2, i0, i2, i0
  input0 = _mm_unpacklo_epi32(input2, input2);
  input1 = _mm_unpackhi_epi32(input2, input2);
  input2 = _mm_unpackhi_epi32(input3, input3);
  input3 = _mm_unpacklo_epi32(input3, input3);

  // Stage 1
  input0 = _mm_madd_epi16(input0, cst);
  input1 = _mm_madd_epi16(input1, cst);
  input2 = _mm_madd_epi16(input2, cst);
  input3 = _mm_madd_epi16(input3, cst);

  input0 = _mm_add_epi32(input0, rounding);
  input1 = _mm_add_epi32(input1, rounding);
  input2 = _mm_add_epi32(input2, rounding);
  input3 = _mm_add_epi32(input3, rounding);

  input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
  input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
  input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
  input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);

  // Stage 2
  input0 = _mm_packs_epi32(input0, input2);
  input1 = _mm_packs_epi32(input1, input3);

  // Transpose
  input2 = _mm_unpacklo_epi16(input0, input1);
  input3 = _mm_unpackhi_epi16(input0, input1);
  input0 = _mm_unpacklo_epi32(input2, input3);
  input1 = _mm_unpackhi_epi32(input2, input3);

  // Switch column 2 and column 3, so that we get:
  // input2: column 1, column 0;  input3: column 2, column 3.
  input1 = _mm_shuffle_epi32(input1, 0x4e);
  input2 = _mm_add_epi16(input0, input1);
  input3 = _mm_sub_epi16(input0, input1);

  // Final round and shift
  input2 = _mm_add_epi16(input2, eight);
  input3 = _mm_add_epi16(input3, eight);

  input2 = _mm_srai_epi16(input2, 4);
  input3 = _mm_srai_epi16(input3, 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(
                                *(const int *)(dest + stride * 3)), d2);
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, input2);
    d2 = _mm_add_epi16(d2, input3);
    d0 = _mm_packus_epi16(d0, d2);
    // store input0
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store input1
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store input2
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
    // store input3
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
  }
}

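// DC-only path: when only the DC coefficient is non-zero, both 1-D passes
// reduce to a multiplication by cospi_16_64, so the reconstructed residual is
// the same rounded constant for every pixel and can simply be broadcast.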
void vp9_idct4x4_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
  RECON_AND_STORE4X4(dest, dc_value);
}

static INLINE void transpose_4x4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}

static void idct4_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8];

  transpose_4x4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);

  u[0] = _mm_packs_epi32(v[0], v[2]);
  u[1] = _mm_packs_epi32(v[1], v[3]);
  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
  u[3] = _mm_unpackhi_epi64(u[1], u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[3]);
  in[1] = _mm_add_epi16(u[1], u[2]);
  in[2] = _mm_sub_epi16(u[1], u[2]);
  in[3] = _mm_sub_epi16(u[0], u[3]);
}

static void iadst4_1d_sse2(__m128i *in) {
  const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
  const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
  const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
  const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
  const __m128i kZero = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i u[8], v[8], in7;

  transpose_4x4(in);
  in7 = _mm_add_epi16(in[0], in[3]);
  in7 = _mm_sub_epi16(in7, in[2]);

  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
  u[2] = _mm_unpacklo_epi16(in7, kZero);
  u[3] = _mm_unpacklo_epi16(in[1], kZero);

  v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
  v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
  v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03);  // x2
  v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01);  // s1 - s4
  v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04);  // s2 - s6
  v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s2

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_add_epi32(v[3], v[4]);
  u[2] = v[2];
  u[3] = _mm_add_epi32(u[0], u[1]);
  u[4] = _mm_slli_epi32(v[5], 2);
  u[5] = _mm_add_epi32(u[3], v[5]);
  u[6] = _mm_sub_epi32(u[5], u[4]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);

  in[0] = _mm_packs_epi32(u[0], u[2]);
  in[1] = _mm_packs_epi32(u[1], u[3]);
  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}

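// Each 1-D helper above transposes the 4x4 block before it operates, so
// calling two of them back to back (rows, then columns) produces the full
// 2-D inverse transform selected by tx_type below.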
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[4];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadl_epi64((const __m128i *)input);
  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_1d_sse2(in);
      idct4_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_1d_sse2(in);
      iadst4_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[2] = _mm_add_epi16(in[2], eight);
  in[3] = _mm_add_epi16(in[3], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);
  in[2] = _mm_srai_epi16(in[2], 4);
  in[3] = _mm_srai_epi16(in[3], 4);

  RECON_AND_STORE4X4(dest, in[0]);
  RECON_AND_STORE4X4(dest, in[1]);
  RECON_AND_STORE4X4(dest, in[2]);
  RECON_AND_STORE4X4(dest, in[3]);
}

#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
  }

#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
                      out0, out1, out2, out3, out4, out5, out6, out7) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
    \
    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
    \
    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
    out4 = out5 = out6 = out7 = zero; \
  }

#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
  { \
    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
    \
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);  /* i1 i0 */ \
    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);  /* i3 i2 */ \
    in2 = _mm_unpacklo_epi32(tr0_2, tr0_3);  /* i5 i4 */ \
    in3 = _mm_unpackhi_epi32(tr0_2, tr0_3);  /* i7 i6 */ \
  }

// Define Macro for multiplying elements by constants and adding them together.
#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
                               cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
  { \
    tmp0 = _mm_madd_epi16(lo_0, cst0); \
    tmp1 = _mm_madd_epi16(hi_0, cst0); \
    tmp2 = _mm_madd_epi16(lo_0, cst1); \
    tmp3 = _mm_madd_epi16(hi_0, cst1); \
    tmp4 = _mm_madd_epi16(lo_1, cst2); \
    tmp5 = _mm_madd_epi16(hi_1, cst2); \
    tmp6 = _mm_madd_epi16(lo_1, cst3); \
    tmp7 = _mm_madd_epi16(hi_1, cst3); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    tmp4 = _mm_add_epi32(tmp4, rounding); \
    tmp5 = _mm_add_epi32(tmp5, rounding); \
    tmp6 = _mm_add_epi32(tmp6, rounding); \
    tmp7 = _mm_add_epi32(tmp7, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
    \
    res0 = _mm_packs_epi32(tmp0, tmp1); \
    res1 = _mm_packs_epi32(tmp2, tmp3); \
    res2 = _mm_packs_epi32(tmp4, tmp5); \
    res3 = _mm_packs_epi32(tmp6, tmp7); \
  }

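// MULTIPLICATION_AND_ADD implements the rotation butterflies used by the
// 1-D transforms: the inputs are 16-bit coefficients interleaved in pairs
// (lo/hi halves of two rows), and each _mm_madd_epi16 with a
// pair_set_epi16(c0, c1) constant computes c0 * a + c1 * b per 32-bit lane,
// which is then rounded back down from DCT_CONST_BITS fixed-point precision.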
#define IDCT8_1D \
  /* Stage1 */ \
  { \
    const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
    const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
    const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
    const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
    \
    MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
                           stg1_1, stg1_2, stg1_3, stp1_4, \
                           stp1_7, stp1_5, stp1_6) \
  } \
  \
  /* Stage2 */ \
  { \
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
    const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
    const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
    const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
                           stg2_1, stg2_2, stg2_3, stp2_0, \
                           stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
    tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
    tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
    tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
  } \
  \
  /* Stage4 */ \
  in0 = _mm_adds_epi16(stp1_0, stp2_7); \
  in1 = _mm_adds_epi16(stp1_1, stp1_6); \
  in2 = _mm_adds_epi16(stp1_2, stp1_5); \
  in3 = _mm_adds_epi16(stp1_3, stp2_4); \
  in4 = _mm_subs_epi16(stp1_3, stp2_4); \
  in5 = _mm_subs_epi16(stp1_2, stp1_5); \
  in6 = _mm_subs_epi16(stp1_1, stp1_6); \
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

#define RECON_AND_STORE(dest, in_x) \
  { \
    __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
    d0 = _mm_unpacklo_epi8(d0, zero); \
    d0 = _mm_add_epi16(in_x, d0); \
    d0 = _mm_packus_epi16(d0, d0); \
    _mm_storel_epi64((__m128i *)(dest), d0); \
    dest += stride; \
  }

void vp9_idct8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;

  // Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in4 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in5 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in6 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in7 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  // 2-D
  for (i = 0; i < 2; i++) {
    // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
    TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                  in4, in5, in6, in7);

    // 4-stage 1D idct8x8
    IDCT8_1D
  }

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

void vp9_idct8x8_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
  RECON_AND_STORE(dest, dc_value);
}

// perform 8x8 transpose
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
  const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
  const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
  const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
  const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);

  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);

  res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
  res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
  res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
  res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
  res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
  res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
  res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
  res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}

static void idct8_1d_sse2(__m128i *in) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  in0 = in[0];
  in1 = in[1];
  in2 = in[2];
  in3 = in[3];
  in4 = in[4];
  in5 = in[5];
  in6 = in[6];
  in7 = in[7];

  // 8x8 Transpose is copied from vp9_fdct8x8_sse2()
  TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7);

  // 4-stage 1D idct8x8
  IDCT8_1D
  in[0] = in0;
  in[1] = in1;
  in[2] = in2;
  in[3] = in3;
  in[4] = in4;
  in[5] = in5;
  in[6] = in6;
  in[7] = in7;
}

static void iadst8_1d_sse2(__m128i *in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__const_0 = _mm_set1_epi16(0);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);

  __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
  __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
  __m128i s0, s1, s2, s3, s4, s5, s6, s7;
  __m128i in0, in1, in2, in3, in4, in5, in6, in7;

  // transpose
  array_transpose_8x8(in, in);

  // properly aligned for butterfly input
  in0 = in[7];
  in1 = in[0];
  in2 = in[5];
  in3 = in[2];
  in4 = in[3];
  in5 = in[4];
  in6 = in[1];
  in7 = in[6];

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s0 = _mm_unpacklo_epi16(in0, in1);
  s1 = _mm_unpackhi_epi16(in0, in1);
  s2 = _mm_unpacklo_epi16(in2, in3);
  s3 = _mm_unpackhi_epi16(in2, in3);
  s4 = _mm_unpacklo_epi16(in4, in5);
  s5 = _mm_unpackhi_epi16(in4, in5);
  s6 = _mm_unpacklo_epi16(in6, in7);
  s7 = _mm_unpackhi_epi16(in6, in7);

  u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
  u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
  u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
  u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
  u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
  u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
  u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
  u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
  u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
  u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
  u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
  u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
  u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
  u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
  u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
  u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);

  // addition
  w0 = _mm_add_epi32(u0, u8);
  w1 = _mm_add_epi32(u1, u9);
  w2 = _mm_add_epi32(u2, u10);
  w3 = _mm_add_epi32(u3, u11);
  w4 = _mm_add_epi32(u4, u12);
  w5 = _mm_add_epi32(u5, u13);
  w6 = _mm_add_epi32(u6, u14);
  w7 = _mm_add_epi32(u7, u15);
  w8 = _mm_sub_epi32(u0, u8);
  w9 = _mm_sub_epi32(u1, u9);
  w10 = _mm_sub_epi32(u2, u10);
  w11 = _mm_sub_epi32(u3, u11);
  w12 = _mm_sub_epi32(u4, u12);
  w13 = _mm_sub_epi32(u5, u13);
  w14 = _mm_sub_epi32(u6, u14);
  w15 = _mm_sub_epi32(u7, u15);

  // shift and rounding
  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
  v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
  v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
  v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
  v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
  v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
  v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
  v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
  v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
  u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
  u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
  u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
  u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
  u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
  u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
  u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
  u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u0, u1);
  in[1] = _mm_packs_epi32(u2, u3);
  in[2] = _mm_packs_epi32(u4, u5);
  in[3] = _mm_packs_epi32(u6, u7);
  in[4] = _mm_packs_epi32(u8, u9);
  in[5] = _mm_packs_epi32(u10, u11);
  in[6] = _mm_packs_epi32(u12, u13);
  in[7] = _mm_packs_epi32(u14, u15);

  // stage 2
  s0 = _mm_add_epi16(in[0], in[2]);
  s1 = _mm_add_epi16(in[1], in[3]);
  s2 = _mm_sub_epi16(in[0], in[2]);
  s3 = _mm_sub_epi16(in[1], in[3]);
  u0 = _mm_unpacklo_epi16(in[4], in[5]);
  u1 = _mm_unpackhi_epi16(in[4], in[5]);
  u2 = _mm_unpacklo_epi16(in[6], in[7]);
  u3 = _mm_unpackhi_epi16(in[6], in[7]);

  v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
  v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
  v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
  v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
  v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
  v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
  v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
  v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);

  w0 = _mm_add_epi32(v0, v4);
  w1 = _mm_add_epi32(v1, v5);
  w2 = _mm_add_epi32(v2, v6);
  w3 = _mm_add_epi32(v3, v7);
  w4 = _mm_sub_epi32(v0, v4);
  w5 = _mm_sub_epi32(v1, v5);
  w6 = _mm_sub_epi32(v2, v6);
  w7 = _mm_sub_epi32(v3, v7);

  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);

  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);

  // back to 16-bit integers
  s4 = _mm_packs_epi32(u0, u1);
  s5 = _mm_packs_epi32(u2, u3);
  s6 = _mm_packs_epi32(u4, u5);
  s7 = _mm_packs_epi32(u6, u7);

  // stage 3
  u0 = _mm_unpacklo_epi16(s2, s3);
  u1 = _mm_unpackhi_epi16(s2, s3);
  u2 = _mm_unpacklo_epi16(s6, s7);
  u3 = _mm_unpackhi_epi16(s6, s7);

  v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
  v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
  v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
  v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
  v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
  v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
  v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
  v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);

  u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
  u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
  u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
  u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
  u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
  u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
  u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
  u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);

  v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
  v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
  v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
  v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
  v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
  v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
  v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
  v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);

  s2 = _mm_packs_epi32(v0, v1);
  s3 = _mm_packs_epi32(v2, v3);
  s6 = _mm_packs_epi32(v4, v5);
  s7 = _mm_packs_epi32(v6, v7);

  in[0] = s0;
  in[1] = _mm_sub_epi16(k__const_0, s4);
  in[2] = s6;
  in[3] = _mm_sub_epi16(k__const_0, s2);
  in[4] = s3;
  in[5] = _mm_sub_epi16(k__const_0, s7);
  in[6] = s5;
  in[7] = _mm_sub_epi16(k__const_0, s1);
}

void vp9_iht8x8_64_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[8];
  const __m128i zero = _mm_setzero_si128();
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);

  // load input data
  in[0] = _mm_load_si128((const __m128i *)input);
  in[1] = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in[2] = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in[3] = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in[4] = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in[5] = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in[6] = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in[7] = _mm_load_si128((const __m128i *)(input + 8 * 7));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst8_1d_sse2(in);
      idct8_1d_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst8_1d_sse2(in);
      iadst8_1d_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 5);
  in[1] = _mm_srai_epi16(in[1], 5);
  in[2] = _mm_srai_epi16(in[2], 5);
  in[3] = _mm_srai_epi16(in[3], 5);
  in[4] = _mm_srai_epi16(in[4], 5);
  in[5] = _mm_srai_epi16(in[5], 5);
  in[6] = _mm_srai_epi16(in[6], 5);
  in[7] = _mm_srai_epi16(in[7], 5);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
}

void vp9_idct8x8_10_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
  const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

  // Rows. Load 4-row input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 3));

  // 8x4 Transpose
  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)

  // Stage1
  { //NOLINT
    const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
    const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);

    tmp0 = _mm_madd_epi16(lo_17, stg1_0);
    tmp2 = _mm_madd_epi16(lo_17, stg1_1);
    tmp4 = _mm_madd_epi16(lo_35, stg1_2);
    tmp6 = _mm_madd_epi16(lo_35, stg1_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);
  }

  // Stage2
  { //NOLINT
    const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
    const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);

    tmp0 = _mm_madd_epi16(lo_04, stg2_0);
    tmp2 = _mm_madd_epi16(lo_04, stg2_1);
    tmp4 = _mm_madd_epi16(lo_26, stg2_2);
    tmp6 = _mm_madd_epi16(lo_26, stg2_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp2_0 = _mm_packs_epi32(tmp0, zero);
    stp2_1 = _mm_packs_epi32(tmp2, zero);
    stp2_2 = _mm_packs_epi32(tmp4, zero);
    stp2_3 = _mm_packs_epi32(tmp6, zero);

    stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
    stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
    stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
    stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
  }

  // Stage3
  { //NOLINT
    const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
    stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
    stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
    stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
    stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);

    tmp0 = _mm_madd_epi16(lo_56, stg3_0);
    tmp2 = _mm_madd_epi16(lo_56, stg2_0);  // stg3_1 = stg2_0

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);

    stp1_5 = _mm_packs_epi32(tmp0, zero);
    stp1_6 = _mm_packs_epi32(tmp2, zero);
  }

  // Stage4
  in0 = _mm_adds_epi16(stp1_0, stp2_7);
  in1 = _mm_adds_epi16(stp1_1, stp1_6);
  in2 = _mm_adds_epi16(stp1_2, stp1_5);
  in3 = _mm_adds_epi16(stp1_3, stp2_4);
  in4 = _mm_subs_epi16(stp1_3, stp2_4);
  in5 = _mm_subs_epi16(stp1_2, stp1_5);
  in6 = _mm_subs_epi16(stp1_1, stp1_6);
  in7 = _mm_subs_epi16(stp1_0, stp2_7);

  // Columns. 4x8 Transpose
  TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                in4, in5, in6, in7)

  // 1D idct8x8
  IDCT8_1D

  // Final rounding and shift
  in0 = _mm_adds_epi16(in0, final_rounding);
  in1 = _mm_adds_epi16(in1, final_rounding);
  in2 = _mm_adds_epi16(in2, final_rounding);
  in3 = _mm_adds_epi16(in3, final_rounding);
  in4 = _mm_adds_epi16(in4, final_rounding);
  in5 = _mm_adds_epi16(in5, final_rounding);
  in6 = _mm_adds_epi16(in6, final_rounding);
  in7 = _mm_adds_epi16(in7, final_rounding);

  in0 = _mm_srai_epi16(in0, 5);
  in1 = _mm_srai_epi16(in1, 5);
  in2 = _mm_srai_epi16(in2, 5);
  in3 = _mm_srai_epi16(in3, 5);
  in4 = _mm_srai_epi16(in4, 5);
  in5 = _mm_srai_epi16(in5, 5);
  in6 = _mm_srai_epi16(in6, 5);
  in7 = _mm_srai_epi16(in7, 5);

  RECON_AND_STORE(dest, in0);
  RECON_AND_STORE(dest, in1);
  RECON_AND_STORE(dest, in2);
  RECON_AND_STORE(dest, in3);
  RECON_AND_STORE(dest, in4);
  RECON_AND_STORE(dest, in5);
  RECON_AND_STORE(dest, in6);
  RECON_AND_STORE(dest, in7);
}

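// In the 16-point macro below, stp1_* and stp2_* hold the outputs of
// alternating butterfly stages (odd and even stages respectively), so each
// stage can read the previous stage's results while overwriting the set from
// two stages back.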
#define IDCT16_1D \
  /* Stage2 */ \
  { \
    const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
    const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
    const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
    const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
    const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
    const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
    const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
    const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
    \
    MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
                           stg2_0, stg2_1, stg2_2, stg2_3, \
                           stp2_8, stp2_15, stp2_9, stp2_14) \
    \
    MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
                           stg2_4, stg2_5, stg2_6, stg2_7, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  } \
  \
  /* Stage3 */ \
  { \
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
    const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
    const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
    \
    MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
                           stg3_0, stg3_1, stg3_2, stg3_3, \
                           stp1_4, stp1_7, stp1_5, stp1_6) \
    \
    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
    \
    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
  } \
  \
  /* Stage4 */ \
  { \
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
    const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
    const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
    \
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
    const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    \
    MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
                           stg4_0, stg4_1, stg4_2, stg4_3, \
                           stp2_0, stp2_1, stp2_2, stp2_3) \
    \
    stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
    stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
    stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
    \
    MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
                           stg4_4, stg4_5, stg4_6, stg4_7, \
                           stp2_9, stp2_14, stp2_10, stp2_13) \
  } \
  \
  /* Stage5 */ \
  { \
    const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
    const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
    \
    stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
    stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
    stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
    stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
    \
    tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
    tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
    tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
    tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
    \
    tmp0 = _mm_add_epi32(tmp0, rounding); \
    tmp1 = _mm_add_epi32(tmp1, rounding); \
    tmp2 = _mm_add_epi32(tmp2, rounding); \
    tmp3 = _mm_add_epi32(tmp3, rounding); \
    \
    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
    \
    stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
    stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
    \
    stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
    stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
    stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
    stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
    \
    stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
    stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
    stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
    stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
  } \
  \
  /* Stage6 */ \
  { \
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
    const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
    const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
    const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
    \
    stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
    stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
    stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
    stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
    stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
    stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
    stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
    stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
    \
    MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
                           stg6_0, stg4_0, stg6_0, stg4_0, \
                           stp2_10, stp2_13, stp2_11, stp2_12) \
  }

1238 void vp9_idct16x16_256_add_sse2(const int16_t *input, uint8_t *dest, |
|
1239 int stride) { |
|
1240 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
|
1241 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
|
1242 const __m128i zero = _mm_setzero_si128(); |
|
1243 |
|
1244 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
|
1245 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
|
1246 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
|
1247 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
|
1248 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
|
1249 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
|
1250 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
|
1251 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
|
1252 |
|
1253 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
|
1254 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
|
1255 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
|
1256 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
|
1257 |
|
1258 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
|
1259 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
|
1260 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
|
1261 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
|
1262 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
|
1263 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
|
1264 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
|
1265 const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
|
1266 |
|
1267 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
|
1268 |
|
1269 __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero, |
|
1270 in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero, |
|
1271 in10 = zero, in11 = zero, in12 = zero, in13 = zero, |
|
1272 in14 = zero, in15 = zero; |
|
1273 __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero, |
|
1274 l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero, |
|
1275 l12 = zero, l13 = zero, l14 = zero, l15 = zero; |
|
1276 __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero, |
|
1277 r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero, |
|
1278 r12 = zero, r13 = zero, r14 = zero, r15 = zero; |
|
1279 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
|
1280 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
|
1281 stp1_8_0, stp1_12_0; |
|
1282 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
|
1283 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; |
|
1284 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|
1285 int i; |
|
1286 |
|
1287 // We work on a 8x16 block each time, and loop 4 times for 2-D 16x16 idct. |
|
  for (i = 0; i < 4; i++) {
    // 1-D idct
    if (i < 2) {
      if (i == 1) input += 128;

      // Load input data.
      in0 = _mm_load_si128((const __m128i *)input);
      in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
      in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
      in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
      in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
      in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
      in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
      in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));
      in4 = _mm_load_si128((const __m128i *)(input + 8 * 8));
      in12 = _mm_load_si128((const __m128i *)(input + 8 * 9));
      in5 = _mm_load_si128((const __m128i *)(input + 8 * 10));
      in13 = _mm_load_si128((const __m128i *)(input + 8 * 11));
      in6 = _mm_load_si128((const __m128i *)(input + 8 * 12));
      in14 = _mm_load_si128((const __m128i *)(input + 8 * 13));
      in7 = _mm_load_si128((const __m128i *)(input + 8 * 14));
      in15 = _mm_load_si128((const __m128i *)(input + 8 * 15));

      TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
                    in10, in11, in12, in13, in14, in15);
    }

    if (i == 2) {
      TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
                    in5, in6, in7);
      TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
                    in13, in14, in15);
    }

    if (i == 3) {
      TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
                    in4, in5, in6, in7);
      TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
                    in12, in13, in14, in15);
    }

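    // IDCT16_1D is a macro defined earlier in this file; it runs the shared
    // butterfly stages of the 1-D 16-point idct on in0-in15 and leaves its
    // results in the stp1_*/stp2_* registers that Stage7 below consumes.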
    IDCT16_1D

    // Stage7
    if (i == 0) {
      // Left 8x16
      l0 = _mm_add_epi16(stp2_0, stp1_15);
      l1 = _mm_add_epi16(stp2_1, stp1_14);
      l2 = _mm_add_epi16(stp2_2, stp2_13);
      l3 = _mm_add_epi16(stp2_3, stp2_12);
      l4 = _mm_add_epi16(stp2_4, stp2_11);
      l5 = _mm_add_epi16(stp2_5, stp2_10);
      l6 = _mm_add_epi16(stp2_6, stp1_9);
      l7 = _mm_add_epi16(stp2_7, stp1_8);
      l8 = _mm_sub_epi16(stp2_7, stp1_8);
      l9 = _mm_sub_epi16(stp2_6, stp1_9);
      l10 = _mm_sub_epi16(stp2_5, stp2_10);
      l11 = _mm_sub_epi16(stp2_4, stp2_11);
      l12 = _mm_sub_epi16(stp2_3, stp2_12);
      l13 = _mm_sub_epi16(stp2_2, stp2_13);
      l14 = _mm_sub_epi16(stp2_1, stp1_14);
      l15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else if (i == 1) {
      // Right 8x16
      r0 = _mm_add_epi16(stp2_0, stp1_15);
      r1 = _mm_add_epi16(stp2_1, stp1_14);
      r2 = _mm_add_epi16(stp2_2, stp2_13);
      r3 = _mm_add_epi16(stp2_3, stp2_12);
      r4 = _mm_add_epi16(stp2_4, stp2_11);
      r5 = _mm_add_epi16(stp2_5, stp2_10);
      r6 = _mm_add_epi16(stp2_6, stp1_9);
      r7 = _mm_add_epi16(stp2_7, stp1_8);
      r8 = _mm_sub_epi16(stp2_7, stp1_8);
      r9 = _mm_sub_epi16(stp2_6, stp1_9);
      r10 = _mm_sub_epi16(stp2_5, stp2_10);
      r11 = _mm_sub_epi16(stp2_4, stp2_11);
      r12 = _mm_sub_epi16(stp2_3, stp2_12);
      r13 = _mm_sub_epi16(stp2_2, stp2_13);
      r14 = _mm_sub_epi16(stp2_1, stp1_14);
      r15 = _mm_sub_epi16(stp2_0, stp1_15);
    } else {
      // 2-D
      in0 = _mm_add_epi16(stp2_0, stp1_15);
      in1 = _mm_add_epi16(stp2_1, stp1_14);
      in2 = _mm_add_epi16(stp2_2, stp2_13);
      in3 = _mm_add_epi16(stp2_3, stp2_12);
      in4 = _mm_add_epi16(stp2_4, stp2_11);
      in5 = _mm_add_epi16(stp2_5, stp2_10);
      in6 = _mm_add_epi16(stp2_6, stp1_9);
      in7 = _mm_add_epi16(stp2_7, stp1_8);
      in8 = _mm_sub_epi16(stp2_7, stp1_8);
      in9 = _mm_sub_epi16(stp2_6, stp1_9);
      in10 = _mm_sub_epi16(stp2_5, stp2_10);
      in11 = _mm_sub_epi16(stp2_4, stp2_11);
      in12 = _mm_sub_epi16(stp2_3, stp2_12);
      in13 = _mm_sub_epi16(stp2_2, stp2_13);
      in14 = _mm_sub_epi16(stp2_1, stp1_14);
      in15 = _mm_sub_epi16(stp2_0, stp1_15);

      // Final rounding and shift
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);

      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);

      dest += 8 - (stride * 16);
    }
  }
}

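// DC-only fast path for the 16x16 inverse DCT, used when only the DC
// coefficient is present.  A scalar sketch of what the SSE2 code below
// computes:
//   a = dct_const_round_shift(input[0] * cospi_16_64);
//   a = dct_const_round_shift(a * cospi_16_64);
//   a = ROUND_POWER_OF_TWO(a, 6);
// and then a is added to each of the 256 destination pixels with unsigned
// 8-bit saturation (RECON_AND_STORE).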
void vp9_idct16x16_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 2; ++i) {
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 16);
  }
}

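// Transposes a 16x16 block held as two 8-column halves (res0 = left eight
// columns, res1 = right eight columns) using four 8x8 sub-transposes; tbuf
// parks one off-diagonal 8x8 block while the other two are swapped.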
static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
  __m128i tbuf[8];
  array_transpose_8x8(res0, res0);
  array_transpose_8x8(res1, tbuf);
  array_transpose_8x8(res0 + 8, res1);
  array_transpose_8x8(res1 + 8, res1 + 8);

  res0[8] = tbuf[0];
  res0[9] = tbuf[1];
  res0[10] = tbuf[2];
  res0[11] = tbuf[3];
  res0[12] = tbuf[4];
  res0[13] = tbuf[5];
  res0[14] = tbuf[6];
  res0[15] = tbuf[7];
}

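// Inverse ADST over eight columns of 16 rows.  The rotation steps in each
// stage follow the same pattern: interleave pairs of 16-bit rows, multiply-
// accumulate against the paired cospi constants with _mm_madd_epi16 in 32-bit
// precision, round with DCT_CONST_ROUNDING, shift by DCT_CONST_BITS, and pack
// back to 16 bits.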
static void iadst16_1d_8col(__m128i *in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
  v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
  v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
  v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
  v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
  v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
  v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
  v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
  v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
  v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
  v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
  v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
  v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
  v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
  v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
  v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
  v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
  u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
  u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
  u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
  u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
  u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
  u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
  u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
  u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
  u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
  u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
  u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
  u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
  u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
  u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
  u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
  u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
  v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
  v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[4] = _mm_packs_epi32(v[4], v[5]);
  in[5] = _mm_packs_epi32(v[12], v[13]);
  in[6] = _mm_packs_epi32(v[8], v[9]);
  in[7] = _mm_packs_epi32(v[0], v[1]);
  in[8] = _mm_packs_epi32(v[2], v[3]);
  in[9] = _mm_packs_epi32(v[10], v[11]);
  in[10] = _mm_packs_epi32(v[14], v[15]);
  in[11] = _mm_packs_epi32(v[6], v[7]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

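// Inverse DCT over eight columns of 16 rows, written as the standard seven
// butterfly stages of the 16-point idct; rotations are evaluated in 32-bit
// precision with _mm_madd_epi16, then rounded, shifted by DCT_CONST_BITS and
// packed back to 16 bits.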
static void idct16_1d_8col(__m128i *in) {
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  __m128i v[16], u[16], s[16], t[16];

  // stage 1
  s[0] = in[0];
  s[1] = in[8];
  s[2] = in[4];
  s[3] = in[12];
  s[4] = in[2];
  s[5] = in[10];
  s[6] = in[6];
  s[7] = in[14];
  s[8] = in[1];
  s[9] = in[9];
  s[10] = in[5];
  s[11] = in[13];
  s[12] = in[3];
  s[13] = in[11];
  s[14] = in[7];
  s[15] = in[15];

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[15]);
  u[1] = _mm_unpackhi_epi16(s[8], s[15]);
  u[2] = _mm_unpacklo_epi16(s[9], s[14]);
  u[3] = _mm_unpackhi_epi16(s[9], s[14]);
  u[4] = _mm_unpacklo_epi16(s[10], s[13]);
  u[5] = _mm_unpackhi_epi16(s[10], s[13]);
  u[6] = _mm_unpacklo_epi16(s[11], s[12]);
  u[7] = _mm_unpackhi_epi16(s[11], s[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[8] = _mm_packs_epi32(u[0], u[1]);
  s[15] = _mm_packs_epi32(u[2], u[3]);
  s[9] = _mm_packs_epi32(u[4], u[5]);
  s[14] = _mm_packs_epi32(u[6], u[7]);
  s[10] = _mm_packs_epi32(u[8], u[9]);
  s[13] = _mm_packs_epi32(u[10], u[11]);
  s[11] = _mm_packs_epi32(u[12], u[13]);
  s[12] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  t[0] = s[0];
  t[1] = s[1];
  t[2] = s[2];
  t[3] = s[3];
  u[0] = _mm_unpacklo_epi16(s[4], s[7]);
  u[1] = _mm_unpackhi_epi16(s[4], s[7]);
  u[2] = _mm_unpacklo_epi16(s[5], s[6]);
  u[3] = _mm_unpackhi_epi16(s[5], s[6]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  t[4] = _mm_packs_epi32(u[0], u[1]);
  t[7] = _mm_packs_epi32(u[2], u[3]);
  t[5] = _mm_packs_epi32(u[4], u[5]);
  t[6] = _mm_packs_epi32(u[6], u[7]);
  t[8] = _mm_add_epi16(s[8], s[9]);
  t[9] = _mm_sub_epi16(s[8], s[9]);
  t[10] = _mm_sub_epi16(s[11], s[10]);
  t[11] = _mm_add_epi16(s[10], s[11]);
  t[12] = _mm_add_epi16(s[12], s[13]);
  t[13] = _mm_sub_epi16(s[12], s[13]);
  t[14] = _mm_sub_epi16(s[15], s[14]);
  t[15] = _mm_add_epi16(s[14], s[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(t[0], t[1]);
  u[1] = _mm_unpackhi_epi16(t[0], t[1]);
  u[2] = _mm_unpacklo_epi16(t[2], t[3]);
  u[3] = _mm_unpackhi_epi16(t[2], t[3]);
  u[4] = _mm_unpacklo_epi16(t[9], t[14]);
  u[5] = _mm_unpackhi_epi16(t[9], t[14]);
  u[6] = _mm_unpacklo_epi16(t[10], t[13]);
  u[7] = _mm_unpackhi_epi16(t[10], t[13]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
  u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
  u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
  u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
  u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
  u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
  u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
  u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
  u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
  u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
  u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
  u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
  u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
  u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
  u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
  u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
  u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_add_epi16(t[4], t[5]);
  s[5] = _mm_sub_epi16(t[4], t[5]);
  s[6] = _mm_sub_epi16(t[7], t[6]);
  s[7] = _mm_add_epi16(t[6], t[7]);
  s[8] = t[8];
  s[15] = t[15];
  s[9] = _mm_packs_epi32(u[8], u[9]);
  s[14] = _mm_packs_epi32(u[10], u[11]);
  s[10] = _mm_packs_epi32(u[12], u[13]);
  s[13] = _mm_packs_epi32(u[14], u[15]);
  s[11] = t[11];
  s[12] = t[12];

  // stage 5
  t[0] = _mm_add_epi16(s[0], s[3]);
  t[1] = _mm_add_epi16(s[1], s[2]);
  t[2] = _mm_sub_epi16(s[1], s[2]);
  t[3] = _mm_sub_epi16(s[0], s[3]);
  t[4] = s[4];
  t[7] = s[7];

  u[0] = _mm_unpacklo_epi16(s[5], s[6]);
  u[1] = _mm_unpackhi_epi16(s[5], s[6]);
  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  t[5] = _mm_packs_epi32(u[0], u[1]);
  t[6] = _mm_packs_epi32(u[2], u[3]);

  t[8] = _mm_add_epi16(s[8], s[11]);
  t[9] = _mm_add_epi16(s[9], s[10]);
  t[10] = _mm_sub_epi16(s[9], s[10]);
  t[11] = _mm_sub_epi16(s[8], s[11]);
  t[12] = _mm_sub_epi16(s[15], s[12]);
  t[13] = _mm_sub_epi16(s[14], s[13]);
  t[14] = _mm_add_epi16(s[13], s[14]);
  t[15] = _mm_add_epi16(s[12], s[15]);

  // stage 6
  s[0] = _mm_add_epi16(t[0], t[7]);
  s[1] = _mm_add_epi16(t[1], t[6]);
  s[2] = _mm_add_epi16(t[2], t[5]);
  s[3] = _mm_add_epi16(t[3], t[4]);
  s[4] = _mm_sub_epi16(t[3], t[4]);
  s[5] = _mm_sub_epi16(t[2], t[5]);
  s[6] = _mm_sub_epi16(t[1], t[6]);
  s[7] = _mm_sub_epi16(t[0], t[7]);
  s[8] = t[8];
  s[9] = t[9];

  u[0] = _mm_unpacklo_epi16(t[10], t[13]);
  u[1] = _mm_unpackhi_epi16(t[10], t[13]);
  u[2] = _mm_unpacklo_epi16(t[11], t[12]);
  u[3] = _mm_unpackhi_epi16(t[11], t[12]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
  v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);

  u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
  u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
  u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
  u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
  u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
  u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
  u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
  u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);

  u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
  u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
  u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
  u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
  u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
  u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
  u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
  u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);

  s[10] = _mm_packs_epi32(u[0], u[1]);
  s[13] = _mm_packs_epi32(u[2], u[3]);
  s[11] = _mm_packs_epi32(u[4], u[5]);
  s[12] = _mm_packs_epi32(u[6], u[7]);
  s[14] = t[14];
  s[15] = t[15];

  // stage 7
  in[0] = _mm_add_epi16(s[0], s[15]);
  in[1] = _mm_add_epi16(s[1], s[14]);
  in[2] = _mm_add_epi16(s[2], s[13]);
  in[3] = _mm_add_epi16(s[3], s[12]);
  in[4] = _mm_add_epi16(s[4], s[11]);
  in[5] = _mm_add_epi16(s[5], s[10]);
  in[6] = _mm_add_epi16(s[6], s[9]);
  in[7] = _mm_add_epi16(s[7], s[8]);
  in[8] = _mm_sub_epi16(s[7], s[8]);
  in[9] = _mm_sub_epi16(s[6], s[9]);
  in[10] = _mm_sub_epi16(s[5], s[10]);
  in[11] = _mm_sub_epi16(s[4], s[11]);
  in[12] = _mm_sub_epi16(s[3], s[12]);
  in[13] = _mm_sub_epi16(s[2], s[13]);
  in[14] = _mm_sub_epi16(s[1], s[14]);
  in[15] = _mm_sub_epi16(s[0], s[15]);
}

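// Full 1-D transforms over all sixteen columns: transpose the 16x16 block
// held in (in0, in1) and run the corresponding 8-column kernel on each half.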
static void idct16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  idct16_1d_8col(in0);
  idct16_1d_8col(in1);
}

static void iadst16_1d_sse2(__m128i *in0, __m128i *in1) {
  array_transpose_16x16(in0, in1);
  iadst16_1d_8col(in0);
  iadst16_1d_8col(in1);
}

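// Loads an 8x16 slice of coefficients: sixteen rows of eight int16_t values
// starting at `input`, with a row pitch of 16 coefficients.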
static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in) {
  in[0] = _mm_load_si128((const __m128i *)(input + 0 * 16));
  in[1] = _mm_load_si128((const __m128i *)(input + 1 * 16));
  in[2] = _mm_load_si128((const __m128i *)(input + 2 * 16));
  in[3] = _mm_load_si128((const __m128i *)(input + 3 * 16));
  in[4] = _mm_load_si128((const __m128i *)(input + 4 * 16));
  in[5] = _mm_load_si128((const __m128i *)(input + 5 * 16));
  in[6] = _mm_load_si128((const __m128i *)(input + 6 * 16));
  in[7] = _mm_load_si128((const __m128i *)(input + 7 * 16));

  in[8] = _mm_load_si128((const __m128i *)(input + 8 * 16));
  in[9] = _mm_load_si128((const __m128i *)(input + 9 * 16));
  in[10] = _mm_load_si128((const __m128i *)(input + 10 * 16));
  in[11] = _mm_load_si128((const __m128i *)(input + 11 * 16));
  in[12] = _mm_load_si128((const __m128i *)(input + 12 * 16));
  in[13] = _mm_load_si128((const __m128i *)(input + 13 * 16));
  in[14] = _mm_load_si128((const __m128i *)(input + 14 * 16));
  in[15] = _mm_load_si128((const __m128i *)(input + 15 * 16));
}

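// Adds the final rounding bias (1 << 5) and shifts right by 6, i.e.
// ROUND_POWER_OF_TWO(x, 6), then adds each of the sixteen rows to the
// destination with unsigned 8-bit saturation via RECON_AND_STORE.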
static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();
  // Final rounding and shift
  in[0] = _mm_adds_epi16(in[0], final_rounding);
  in[1] = _mm_adds_epi16(in[1], final_rounding);
  in[2] = _mm_adds_epi16(in[2], final_rounding);
  in[3] = _mm_adds_epi16(in[3], final_rounding);
  in[4] = _mm_adds_epi16(in[4], final_rounding);
  in[5] = _mm_adds_epi16(in[5], final_rounding);
  in[6] = _mm_adds_epi16(in[6], final_rounding);
  in[7] = _mm_adds_epi16(in[7], final_rounding);
  in[8] = _mm_adds_epi16(in[8], final_rounding);
  in[9] = _mm_adds_epi16(in[9], final_rounding);
  in[10] = _mm_adds_epi16(in[10], final_rounding);
  in[11] = _mm_adds_epi16(in[11], final_rounding);
  in[12] = _mm_adds_epi16(in[12], final_rounding);
  in[13] = _mm_adds_epi16(in[13], final_rounding);
  in[14] = _mm_adds_epi16(in[14], final_rounding);
  in[15] = _mm_adds_epi16(in[15], final_rounding);

  in[0] = _mm_srai_epi16(in[0], 6);
  in[1] = _mm_srai_epi16(in[1], 6);
  in[2] = _mm_srai_epi16(in[2], 6);
  in[3] = _mm_srai_epi16(in[3], 6);
  in[4] = _mm_srai_epi16(in[4], 6);
  in[5] = _mm_srai_epi16(in[5], 6);
  in[6] = _mm_srai_epi16(in[6], 6);
  in[7] = _mm_srai_epi16(in[7], 6);
  in[8] = _mm_srai_epi16(in[8], 6);
  in[9] = _mm_srai_epi16(in[9], 6);
  in[10] = _mm_srai_epi16(in[10], 6);
  in[11] = _mm_srai_epi16(in[11], 6);
  in[12] = _mm_srai_epi16(in[12], 6);
  in[13] = _mm_srai_epi16(in[13], 6);
  in[14] = _mm_srai_epi16(in[14], 6);
  in[15] = _mm_srai_epi16(in[15], 6);

  RECON_AND_STORE(dest, in[0]);
  RECON_AND_STORE(dest, in[1]);
  RECON_AND_STORE(dest, in[2]);
  RECON_AND_STORE(dest, in[3]);
  RECON_AND_STORE(dest, in[4]);
  RECON_AND_STORE(dest, in[5]);
  RECON_AND_STORE(dest, in[6]);
  RECON_AND_STORE(dest, in[7]);
  RECON_AND_STORE(dest, in[8]);
  RECON_AND_STORE(dest, in[9]);
  RECON_AND_STORE(dest, in[10]);
  RECON_AND_STORE(dest, in[11]);
  RECON_AND_STORE(dest, in[12]);
  RECON_AND_STORE(dest, in[13]);
  RECON_AND_STORE(dest, in[14]);
  RECON_AND_STORE(dest, in[15]);
}

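// 16x16 hybrid inverse transform.  tx_type selects DCT or ADST for the two
// directions; each *_1d_sse2 call transposes the block before its 1-D pass,
// so after the two calls the data is back in its original orientation and is
// written out as two 8x16 halves.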
void vp9_iht16x16_256_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                               int tx_type) {
  __m128i in0[16], in1[16];

  load_buffer_8x16(input, in0);
  input += 8;
  load_buffer_8x16(input, in1);

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 1:  // ADST_DCT
      idct16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    case 2:  // DCT_ADST
      iadst16_1d_sse2(in0, in1);
      idct16_1d_sse2(in0, in1);
      break;
    case 3:  // ADST_ADST
      iadst16_1d_sse2(in0, in1);
      iadst16_1d_sse2(in0, in1);
      break;
    default:
      assert(0);
      break;
  }

  write_buffer_8x16(dest, in0, stride);
  dest += 8;
  write_buffer_8x16(dest, in1, stride);
}

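// Reduced-work 16x16 idct for sparse blocks (the "_10" variant): only the
// first four rows of coefficients are loaded and transposed with
// TRANSPOSE_8X4 before the usual stage sequence, since the remaining
// coefficients are zero.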
void vp9_idct16x16_10_add_sse2(const int16_t *input, uint8_t *dest,
                               int stride) {
  const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
  const __m128i zero = _mm_setzero_si128();

  const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);

  const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);

  const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);

  const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);

  __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
          in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
          in10 = zero, in11 = zero, in12 = zero, in13 = zero,
          in14 = zero, in15 = zero;
  __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
          l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
          l12 = zero, l13 = zero, l14 = zero, l15 = zero;

  __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
          stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
          stp1_8_0, stp1_12_0;
  __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
          stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  int i;
  // 1-D idct. Load input data.
  in0 = _mm_load_si128((const __m128i *)input);
  in8 = _mm_load_si128((const __m128i *)(input + 8 * 1));
  in1 = _mm_load_si128((const __m128i *)(input + 8 * 2));
  in9 = _mm_load_si128((const __m128i *)(input + 8 * 3));
  in2 = _mm_load_si128((const __m128i *)(input + 8 * 4));
  in10 = _mm_load_si128((const __m128i *)(input + 8 * 5));
  in3 = _mm_load_si128((const __m128i *)(input + 8 * 6));
  in11 = _mm_load_si128((const __m128i *)(input + 8 * 7));

  TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
  TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);

  // Stage2
  {
    const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
    const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
    const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
    const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);

    tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
    tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
    tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
    tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
    tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
    tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
    tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
    tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);

    stp2_8 = _mm_packs_epi32(tmp0, zero);
    stp2_15 = _mm_packs_epi32(tmp2, zero);
    stp2_9 = _mm_packs_epi32(tmp4, zero);
    stp2_14 = _mm_packs_epi32(tmp6, zero);

    stp2_10 = _mm_packs_epi32(tmp1, zero);
    stp2_13 = _mm_packs_epi32(tmp3, zero);
    stp2_11 = _mm_packs_epi32(tmp5, zero);
    stp2_12 = _mm_packs_epi32(tmp7, zero);
  }

  // Stage3
  {
    const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
    const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);

    tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
    tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
    tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
    tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);

    stp1_4 = _mm_packs_epi32(tmp0, zero);
    stp1_7 = _mm_packs_epi32(tmp2, zero);
    stp1_5 = _mm_packs_epi32(tmp4, zero);
    stp1_6 = _mm_packs_epi32(tmp6, zero);

    stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
    stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
    stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
    stp1_11 = _mm_add_epi16(stp2_11, stp2_10);

    stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
    stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
    stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
    stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
  }

  // Stage4
  {
    const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
    const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
    const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
    const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);

    tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
    tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
    tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
    tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
    tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
    tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
    tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
    tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);

    tmp0 = _mm_add_epi32(tmp0, rounding);
    tmp2 = _mm_add_epi32(tmp2, rounding);
    tmp4 = _mm_add_epi32(tmp4, rounding);
    tmp6 = _mm_add_epi32(tmp6, rounding);
    tmp1 = _mm_add_epi32(tmp1, rounding);
    tmp3 = _mm_add_epi32(tmp3, rounding);
    tmp5 = _mm_add_epi32(tmp5, rounding);
    tmp7 = _mm_add_epi32(tmp7, rounding);

    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
2597 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
|
2598 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
|
2599 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
|
2600 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
|
2601 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); |
|
2602 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); |
|
2603 tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); |
|
2604 tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); |
|
2605 |
|
2606 stp2_0 = _mm_packs_epi32(tmp0, zero); |
|
2607 stp2_1 = _mm_packs_epi32(tmp2, zero); |
|
2608 stp2_2 = _mm_packs_epi32(tmp4, zero); |
|
2609 stp2_3 = _mm_packs_epi32(tmp6, zero); |
|
2610 stp2_9 = _mm_packs_epi32(tmp1, zero); |
|
2611 stp2_14 = _mm_packs_epi32(tmp3, zero); |
|
2612 stp2_10 = _mm_packs_epi32(tmp5, zero); |
|
2613 stp2_13 = _mm_packs_epi32(tmp7, zero); |
|
2614 |
|
2615 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); |
|
2616 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); |
|
2617 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); |
|
2618 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); |
|
2619 } |
|
2620 |
|
2621 // Stage5 and Stage6 |
|
2622 { |
|
2623 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); |
|
2624 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); |
|
2625 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); |
|
2626 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); |
|
2627 |
|
2628 stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); |
|
2629 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); |
|
2630 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); |
|
2631 stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); |
|
2632 |
|
2633 stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); |
|
2634 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); |
|
2635 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); |
|
2636 stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); |
|
2637 } |
|
2638 |
|
2639 // Stage6 |
|
2640 { |
|
2641 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); |
|
2642 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); |
|
2643 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); |
|
2644 |
|
2645 tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); |
|
2646 tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); |
|
2647 tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); |
|
2648 tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); |
|
2649 tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); |
|
2650 tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); |
|
2651 |
|
2652 tmp1 = _mm_add_epi32(tmp1, rounding); |
|
2653 tmp3 = _mm_add_epi32(tmp3, rounding); |
|
2654 tmp0 = _mm_add_epi32(tmp0, rounding); |
|
2655 tmp2 = _mm_add_epi32(tmp2, rounding); |
|
2656 tmp4 = _mm_add_epi32(tmp4, rounding); |
|
2657 tmp6 = _mm_add_epi32(tmp6, rounding); |
|
2658 |
|
2659 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); |
|
2660 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); |
|
2661 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); |
|
2662 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); |
|
2663 tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); |
|
2664 tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); |
|
2665 |
|
2666 stp1_5 = _mm_packs_epi32(tmp1, zero); |
|
2667 stp1_6 = _mm_packs_epi32(tmp3, zero); |
|
2668 stp2_10 = _mm_packs_epi32(tmp0, zero); |
|
2669 stp2_13 = _mm_packs_epi32(tmp2, zero); |
|
2670 stp2_11 = _mm_packs_epi32(tmp4, zero); |
|
2671 stp2_12 = _mm_packs_epi32(tmp6, zero); |
|
2672 |
|
2673 stp2_0 = _mm_add_epi16(stp1_0, stp2_7); |
|
2674 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); |
|
2675 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); |
|
2676 stp2_3 = _mm_add_epi16(stp1_3, stp2_4); |
|
2677 stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); |
|
2678 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); |
|
2679 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); |
|
2680 stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); |
|
2681 } |
|
2682 |
|
2683 // Stage7. Left 8x16 only. |
|
2684 l0 = _mm_add_epi16(stp2_0, stp1_15); |
|
2685 l1 = _mm_add_epi16(stp2_1, stp1_14); |
|
2686 l2 = _mm_add_epi16(stp2_2, stp2_13); |
|
2687 l3 = _mm_add_epi16(stp2_3, stp2_12); |
|
2688 l4 = _mm_add_epi16(stp2_4, stp2_11); |
|
2689 l5 = _mm_add_epi16(stp2_5, stp2_10); |
|
2690 l6 = _mm_add_epi16(stp2_6, stp1_9); |
|
2691 l7 = _mm_add_epi16(stp2_7, stp1_8); |
|
2692 l8 = _mm_sub_epi16(stp2_7, stp1_8); |
|
2693 l9 = _mm_sub_epi16(stp2_6, stp1_9); |
|
2694 l10 = _mm_sub_epi16(stp2_5, stp2_10); |
|
2695 l11 = _mm_sub_epi16(stp2_4, stp2_11); |
|
2696 l12 = _mm_sub_epi16(stp2_3, stp2_12); |
|
2697 l13 = _mm_sub_epi16(stp2_2, stp2_13); |
|
2698 l14 = _mm_sub_epi16(stp2_1, stp1_14); |
|
2699 l15 = _mm_sub_epi16(stp2_0, stp1_15); |
|
2700 |
|
2701 // 2-D idct. We do 2 8x16 blocks. |
|
2702 for (i = 0; i < 2; i++) { |
|
2703 if (i == 0) |
|
2704 TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4, |
|
2705 in5, in6, in7); |
|
2706 |
|
2707 if (i == 1) |
|
2708 TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3, |
|
2709 in4, in5, in6, in7); |
|
2710 |
|
2711 in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero; |
|
2712 |
|
2713 IDCT16_1D |
|
2714 |
|
2715 // Stage7 |
|
2716 in0 = _mm_add_epi16(stp2_0, stp1_15); |
|
2717 in1 = _mm_add_epi16(stp2_1, stp1_14); |
|
2718 in2 = _mm_add_epi16(stp2_2, stp2_13); |
|
2719 in3 = _mm_add_epi16(stp2_3, stp2_12); |
|
2720 in4 = _mm_add_epi16(stp2_4, stp2_11); |
|
2721 in5 = _mm_add_epi16(stp2_5, stp2_10); |
|
2722 in6 = _mm_add_epi16(stp2_6, stp1_9); |
|
2723 in7 = _mm_add_epi16(stp2_7, stp1_8); |
|
2724 in8 = _mm_sub_epi16(stp2_7, stp1_8); |
|
2725 in9 = _mm_sub_epi16(stp2_6, stp1_9); |
|
2726 in10 = _mm_sub_epi16(stp2_5, stp2_10); |
|
2727 in11 = _mm_sub_epi16(stp2_4, stp2_11); |
|
2728 in12 = _mm_sub_epi16(stp2_3, stp2_12); |
|
2729 in13 = _mm_sub_epi16(stp2_2, stp2_13); |
|
2730 in14 = _mm_sub_epi16(stp2_1, stp1_14); |
|
2731 in15 = _mm_sub_epi16(stp2_0, stp1_15); |
|
2732 |
|
2733 // Final rounding and shift |
|
2734 in0 = _mm_adds_epi16(in0, final_rounding); |
|
2735 in1 = _mm_adds_epi16(in1, final_rounding); |
|
2736 in2 = _mm_adds_epi16(in2, final_rounding); |
|
2737 in3 = _mm_adds_epi16(in3, final_rounding); |
|
2738 in4 = _mm_adds_epi16(in4, final_rounding); |
|
2739 in5 = _mm_adds_epi16(in5, final_rounding); |
|
2740 in6 = _mm_adds_epi16(in6, final_rounding); |
|
2741 in7 = _mm_adds_epi16(in7, final_rounding); |
|
2742 in8 = _mm_adds_epi16(in8, final_rounding); |
|
2743 in9 = _mm_adds_epi16(in9, final_rounding); |
|
2744 in10 = _mm_adds_epi16(in10, final_rounding); |
|
2745 in11 = _mm_adds_epi16(in11, final_rounding); |
|
2746 in12 = _mm_adds_epi16(in12, final_rounding); |
|
2747 in13 = _mm_adds_epi16(in13, final_rounding); |
|
2748 in14 = _mm_adds_epi16(in14, final_rounding); |
|
2749 in15 = _mm_adds_epi16(in15, final_rounding); |
|
2750 |
|
2751 in0 = _mm_srai_epi16(in0, 6); |
|
2752 in1 = _mm_srai_epi16(in1, 6); |
|
2753 in2 = _mm_srai_epi16(in2, 6); |
|
2754 in3 = _mm_srai_epi16(in3, 6); |
|
2755 in4 = _mm_srai_epi16(in4, 6); |
|
2756 in5 = _mm_srai_epi16(in5, 6); |
|
2757 in6 = _mm_srai_epi16(in6, 6); |
|
2758 in7 = _mm_srai_epi16(in7, 6); |
|
2759 in8 = _mm_srai_epi16(in8, 6); |
|
2760 in9 = _mm_srai_epi16(in9, 6); |
|
2761 in10 = _mm_srai_epi16(in10, 6); |
|
2762 in11 = _mm_srai_epi16(in11, 6); |
|
2763 in12 = _mm_srai_epi16(in12, 6); |
|
2764 in13 = _mm_srai_epi16(in13, 6); |
|
2765 in14 = _mm_srai_epi16(in14, 6); |
|
2766 in15 = _mm_srai_epi16(in15, 6); |
|
2767 |
|
2768 RECON_AND_STORE(dest, in0); |
|
2769 RECON_AND_STORE(dest, in1); |
|
2770 RECON_AND_STORE(dest, in2); |
|
2771 RECON_AND_STORE(dest, in3); |
|
2772 RECON_AND_STORE(dest, in4); |
|
2773 RECON_AND_STORE(dest, in5); |
|
2774 RECON_AND_STORE(dest, in6); |
|
2775 RECON_AND_STORE(dest, in7); |
|
2776 RECON_AND_STORE(dest, in8); |
|
2777 RECON_AND_STORE(dest, in9); |
|
2778 RECON_AND_STORE(dest, in10); |
|
2779 RECON_AND_STORE(dest, in11); |
|
2780 RECON_AND_STORE(dest, in12); |
|
2781 RECON_AND_STORE(dest, in13); |
|
2782 RECON_AND_STORE(dest, in14); |
|
2783 RECON_AND_STORE(dest, in15); |
|
2784 |
|
2785 dest += 8 - (stride * 16); |
|
2786 } |
|
2787 } |
|
2788 |
|
2789 #define LOAD_DQCOEFF(reg, input) \ |
|
2790 { \ |
|
2791 reg = _mm_load_si128((const __m128i *) input); \ |
|
2792 input += 8; \ |
|
2793 } \ |
|
2794 |
|
2795 #define IDCT32_1D \ |
|
2796 /* Stage1 */ \ |
|
2797 { \ |
|
2798 const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31); \ |
|
2799 const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31); \ |
|
2800 const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15); \ |
|
2801 const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15); \ |
|
2802 \ |
|
2803 const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23); \ |
|
2804 const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23); \ |
|
2805 const __m128i lo_25_7= _mm_unpacklo_epi16(in25, in7); \ |
|
2806 const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7); \ |
|
2807 \ |
|
2808 const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27); \ |
|
2809 const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27); \ |
|
2810 const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11); \ |
|
2811 const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11); \ |
|
2812 \ |
|
2813 const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19); \ |
|
2814 const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19); \ |
|
2815 const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3); \ |
|
2816 const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3); \ |
|
2817 \ |
|
2818 MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ |
|
2819 stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, \ |
|
2820 stp1_17, stp1_30) \ |
|
2821 MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, \ |
|
2822 stg1_5, stg1_6, stg1_7, stp1_18, stp1_29, \ |
|
2823 stp1_19, stp1_28) \ |
|
2824 MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ |
|
2825 stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ |
|
2826 stp1_21, stp1_26) \ |
|
2827 MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ |
|
2828 stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ |
|
2829 stp1_23, stp1_24) \ |
|
2830 } \ |
|
2831 \ |
|
2832 /* Stage2 */ \ |
|
2833 { \ |
|
2834 const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30); \ |
|
2835 const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30); \ |
|
2836 const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14); \ |
|
2837 const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14); \ |
|
2838 \ |
|
2839 const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22); \ |
|
2840 const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22); \ |
|
2841 const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6); \ |
|
2842 const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6); \ |
|
2843 \ |
|
2844 MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ |
|
2845 stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ |
|
2846 stp2_14) \ |
|
2847 MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ |
|
2848 stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, \ |
|
2849 stp2_11, stp2_12) \ |
|
2850 \ |
|
2851 stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ |
|
2852 stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ |
|
2853 stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ |
|
2854 stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ |
|
2855 \ |
|
2856 stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ |
|
2857 stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ |
|
2858 stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ |
|
2859 stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ |
|
2860 \ |
|
2861 stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ |
|
2862 stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ |
|
2863 stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ |
|
2864 stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ |
|
2865 \ |
|
2866 stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ |
|
2867 stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ |
|
2868 stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ |
|
2869 stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ |
|
2870 } \ |
|
2871 \ |
|
2872 /* Stage3 */ \ |
|
2873 { \ |
|
2874 const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28); \ |
|
2875 const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28); \ |
|
2876 const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12); \ |
|
2877 const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12); \ |
|
2878 \ |
|
2879 const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ |
|
2880 const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ |
|
2881 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
|
2882 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
|
2883 \ |
|
2884 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
|
2885 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
|
2886 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
|
2887 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
|
2888 \ |
|
2889 MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ |
|
2890 stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ |
|
2891 stp1_6) \ |
|
2892 \ |
|
2893 stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ |
|
2894 stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ |
|
2895 stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ |
|
2896 stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ |
|
2897 stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ |
|
2898 stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ |
|
2899 stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ |
|
2900 stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ |
|
2901 \ |
|
2902 MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ |
|
2903 stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, \ |
|
2904 stp1_18, stp1_29) \ |
|
2905 MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ |
|
2906 stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, \ |
|
2907 stp1_22, stp1_25) \ |
|
2908 \ |
|
2909 stp1_16 = stp2_16; \ |
|
2910 stp1_31 = stp2_31; \ |
|
2911 stp1_19 = stp2_19; \ |
|
2912 stp1_20 = stp2_20; \ |
|
2913 stp1_23 = stp2_23; \ |
|
2914 stp1_24 = stp2_24; \ |
|
2915 stp1_27 = stp2_27; \ |
|
2916 stp1_28 = stp2_28; \ |
|
2917 } \ |
|
2918 \ |
|
2919 /* Stage4 */ \ |
|
2920 { \ |
|
2921 const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16); \ |
|
2922 const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16); \ |
|
2923 const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24); \ |
|
2924 const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24); \ |
|
2925 \ |
|
2926 const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ |
|
2927 const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ |
|
2928 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
|
2929 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
|
2930 \ |
|
2931 MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, \ |
|
2932 stg4_1, stg4_2, stg4_3, stp2_0, stp2_1, \ |
|
2933 stp2_2, stp2_3) \ |
|
2934 \ |
|
2935 stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ |
|
2936 stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ |
|
2937 stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ |
|
2938 stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ |
|
2939 \ |
|
2940 MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ |
|
2941 stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, \ |
|
2942 stp2_10, stp2_13) \ |
|
2943 \ |
|
2944 stp2_8 = stp1_8; \ |
|
2945 stp2_15 = stp1_15; \ |
|
2946 stp2_11 = stp1_11; \ |
|
2947 stp2_12 = stp1_12; \ |
|
2948 \ |
|
2949 stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ |
|
2950 stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ |
|
2951 stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ |
|
2952 stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ |
|
2953 stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ |
|
2954 stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ |
|
2955 stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ |
|
2956 stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ |
|
2957 \ |
|
2958 stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ |
|
2959 stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ |
|
2960 stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ |
|
2961 stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ |
|
2962 stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ |
|
2963 stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ |
|
2964 stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ |
|
2965 stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ |
|
2966 } \ |
|
2967 \ |
|
2968 /* Stage5 */ \ |
|
2969 { \ |
|
2970 const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ |
|
2971 const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ |
|
2972 const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ |
|
2973 const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ |
|
2974 \ |
|
2975 const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ |
|
2976 const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ |
|
2977 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
|
2978 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
|
2979 \ |
|
2980 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
|
2981 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
|
2982 \ |
|
2983 stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ |
|
2984 stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ |
|
2985 stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ |
|
2986 stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ |
|
2987 \ |
|
2988 tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ |
|
2989 tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ |
|
2990 tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ |
|
2991 tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ |
|
2992 \ |
|
2993 tmp0 = _mm_add_epi32(tmp0, rounding); \ |
|
2994 tmp1 = _mm_add_epi32(tmp1, rounding); \ |
|
2995 tmp2 = _mm_add_epi32(tmp2, rounding); \ |
|
2996 tmp3 = _mm_add_epi32(tmp3, rounding); \ |
|
2997 \ |
|
2998 tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ |
|
2999 tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ |
|
3000 tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ |
|
3001 tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ |
|
3002 \ |
|
3003 stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ |
|
3004 stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ |
|
3005 \ |
|
3006 stp1_4 = stp2_4; \ |
|
3007 stp1_7 = stp2_7; \ |
|
3008 \ |
|
3009 stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ |
|
3010 stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ |
|
3011 stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ |
|
3012 stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ |
|
3013 stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ |
|
3014 stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ |
|
3015 stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ |
|
3016 stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ |
|
3017 \ |
|
3018 stp1_16 = stp2_16; \ |
|
3019 stp1_17 = stp2_17; \ |
|
3020 \ |
|
3021 MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ |
|
3022 stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, \ |
|
3023 stp1_19, stp1_28) \ |
|
3024 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ |
|
3025 stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, \ |
|
3026 stp1_21, stp1_26) \ |
|
3027 \ |
|
3028 stp1_22 = stp2_22; \ |
|
3029 stp1_23 = stp2_23; \ |
|
3030 stp1_24 = stp2_24; \ |
|
3031 stp1_25 = stp2_25; \ |
|
3032 stp1_30 = stp2_30; \ |
|
3033 stp1_31 = stp2_31; \ |
|
3034 } \ |
|
3035 \ |
|
3036 /* Stage6 */ \ |
|
3037 { \ |
|
3038 const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ |
|
3039 const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ |
|
3040 const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ |
|
3041 const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ |
|
3042 \ |
|
3043 stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ |
|
3044 stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ |
|
3045 stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ |
|
3046 stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ |
|
3047 stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ |
|
3048 stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ |
|
3049 stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ |
|
3050 stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ |
|
3051 \ |
|
3052 stp2_8 = stp1_8; \ |
|
3053 stp2_9 = stp1_9; \ |
|
3054 stp2_14 = stp1_14; \ |
|
3055 stp2_15 = stp1_15; \ |
|
3056 \ |
|
3057 MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \ |
|
3058 stg6_0, stg4_0, stg6_0, stg4_0, stp2_10, \ |
|
3059 stp2_13, stp2_11, stp2_12) \ |
|
3060 \ |
|
3061 stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ |
|
3062 stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ |
|
3063 stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ |
|
3064 stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ |
|
3065 stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ |
|
3066 stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ |
|
3067 stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ |
|
3068 stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ |
|
3069 \ |
|
3070 stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ |
|
3071 stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ |
|
3072 stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ |
|
3073 stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ |
|
3074 stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ |
|
3075 stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ |
|
3076 stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ |
|
3077 stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ |
|
3078 } \ |
|
3079 \ |
|
3080 /* Stage7 */ \ |
|
3081 { \ |
|
3082 const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ |
|
3083 const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ |
|
3084 const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ |
|
3085 const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ |
|
3086 \ |
|
3087 const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ |
|
3088 const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ |
|
3089 const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ |
|
3090 const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ |
|
3091 \ |
|
3092 stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ |
|
3093 stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ |
|
3094 stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ |
|
3095 stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ |
|
3096 stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ |
|
3097 stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ |
|
3098 stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ |
|
3099 stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ |
|
3100 stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ |
|
3101 stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ |
|
3102 stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ |
|
3103 stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ |
|
3104 stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ |
|
3105 stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ |
|
3106 stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ |
|
3107 stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ |
|
3108 \ |
|
3109 stp1_16 = stp2_16; \ |
|
3110 stp1_17 = stp2_17; \ |
|
3111 stp1_18 = stp2_18; \ |
|
3112 stp1_19 = stp2_19; \ |
|
3113 \ |
|
3114 MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ |
|
3115 stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, \ |
|
3116 stp1_21, stp1_26) \ |
|
3117 MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ |
|
3118 stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, \ |
|
3119 stp1_23, stp1_24) \ |
|
3120 \ |
|
3121 stp1_28 = stp2_28; \ |
|
3122 stp1_29 = stp2_29; \ |
|
3123 stp1_30 = stp2_30; \ |
|
3124 stp1_31 = stp2_31; \ |
|
3125 } |
|
3126 |
|
3127 // Only upper-left 8x8 has non-zero coeff |
|
3128 void vp9_idct32x32_34_add_sse2(const int16_t *input, uint8_t *dest, |
|
3129 int stride) { |
|
3130 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
|
3131 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
|
3132 |
|
3133 // idct constants for each stage |
|
3134 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
|
3135 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
|
3136 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
|
3137 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
|
3138 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
|
3139 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
|
3140 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
|
3141 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
|
3142 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
|
3143 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); |
|
3144 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); |
|
3145 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); |
|
3146 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
|
3147 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); |
|
3148 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
|
3149 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); |
|
3150 |
|
3151 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
|
3152 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
|
3153 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
|
3154 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
|
3155 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
|
3156 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
|
3157 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
|
3158 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
|
3159 |
|
3160 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
|
3161 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
|
3162 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
|
3163 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
|
3164 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
|
3165 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); |
|
3166 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
|
3167 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
|
3168 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); |
|
3169 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
|
3170 |
|
3171 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
|
3172 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
|
3173 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
|
3174 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
|
3175 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
|
3176 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
|
3177 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
|
3178 |
|
3179 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
|
3180 |
|
3181 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, |
|
3182 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, |
|
3183 in24, in25, in26, in27, in28, in29, in30, in31; |
|
3184 __m128i col[128]; |
|
3185 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
|
3186 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
|
3187 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
|
3188 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
|
3189 stp1_30, stp1_31; |
|
3190 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
|
3191 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
|
3192 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
|
3193 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
|
3194 stp2_30, stp2_31; |
|
3195 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|
3196 int i, j, i32; |
|
3197 |
|
3198 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. |
|
3199 for (i = 0; i < 8; i++) { |
|
3200 i32 = (i << 5); |
|
3201 if (i == 0) { |
|
3202 // First 1-D idct: first 8 rows |
|
3203 // Load input data. |
|
3204 LOAD_DQCOEFF(in0, input); |
|
3205 LOAD_DQCOEFF(in8, input); |
|
3206 LOAD_DQCOEFF(in16, input); |
|
3207 LOAD_DQCOEFF(in24, input); |
|
3208 LOAD_DQCOEFF(in1, input); |
|
3209 LOAD_DQCOEFF(in9, input); |
|
3210 LOAD_DQCOEFF(in17, input); |
|
3211 LOAD_DQCOEFF(in25, input); |
|
3212 LOAD_DQCOEFF(in2, input); |
|
3213 LOAD_DQCOEFF(in10, input); |
|
3214 LOAD_DQCOEFF(in18, input); |
|
3215 LOAD_DQCOEFF(in26, input); |
|
3216 LOAD_DQCOEFF(in3, input); |
|
3217 LOAD_DQCOEFF(in11, input); |
|
3218 LOAD_DQCOEFF(in19, input); |
|
3219 LOAD_DQCOEFF(in27, input); |
|
3220 |
|
3221 LOAD_DQCOEFF(in4, input); |
|
3222 LOAD_DQCOEFF(in12, input); |
|
3223 LOAD_DQCOEFF(in20, input); |
|
3224 LOAD_DQCOEFF(in28, input); |
|
3225 LOAD_DQCOEFF(in5, input); |
|
3226 LOAD_DQCOEFF(in13, input); |
|
3227 LOAD_DQCOEFF(in21, input); |
|
3228 LOAD_DQCOEFF(in29, input); |
|
3229 LOAD_DQCOEFF(in6, input); |
|
3230 LOAD_DQCOEFF(in14, input); |
|
3231 LOAD_DQCOEFF(in22, input); |
|
3232 LOAD_DQCOEFF(in30, input); |
|
3233 LOAD_DQCOEFF(in7, input); |
|
3234 LOAD_DQCOEFF(in15, input); |
|
3235 LOAD_DQCOEFF(in23, input); |
|
3236 LOAD_DQCOEFF(in31, input); |
|
3237 |
|
3238 // Transpose 32x8 block to 8x32 block |
|
3239 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
|
3240 in4, in5, in6, in7); |
|
3241 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
|
3242 in10, in11, in12, in13, in14, in15); |
|
3243 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, |
|
3244 in18, in19, in20, in21, in22, in23); |
|
3245 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, |
|
3246 in26, in27, in28, in29, in30, in31); |
|
3247 } else if (i < 4) { |
|
3248 // First 1-D idct: next 24 zero-coeff rows |
|
3249 col[i32 + 0] = _mm_setzero_si128(); |
|
3250 col[i32 + 1] = _mm_setzero_si128(); |
|
3251 col[i32 + 2] = _mm_setzero_si128(); |
|
3252 col[i32 + 3] = _mm_setzero_si128(); |
|
3253 col[i32 + 4] = _mm_setzero_si128(); |
|
3254 col[i32 + 5] = _mm_setzero_si128(); |
|
3255 col[i32 + 6] = _mm_setzero_si128(); |
|
3256 col[i32 + 7] = _mm_setzero_si128(); |
|
3257 col[i32 + 8] = _mm_setzero_si128(); |
|
3258 col[i32 + 9] = _mm_setzero_si128(); |
|
3259 col[i32 + 10] = _mm_setzero_si128(); |
|
3260 col[i32 + 11] = _mm_setzero_si128(); |
|
3261 col[i32 + 12] = _mm_setzero_si128(); |
|
3262 col[i32 + 13] = _mm_setzero_si128(); |
|
3263 col[i32 + 14] = _mm_setzero_si128(); |
|
3264 col[i32 + 15] = _mm_setzero_si128(); |
|
3265 col[i32 + 16] = _mm_setzero_si128(); |
|
3266 col[i32 + 17] = _mm_setzero_si128(); |
|
3267 col[i32 + 18] = _mm_setzero_si128(); |
|
3268 col[i32 + 19] = _mm_setzero_si128(); |
|
3269 col[i32 + 20] = _mm_setzero_si128(); |
|
3270 col[i32 + 21] = _mm_setzero_si128(); |
|
3271 col[i32 + 22] = _mm_setzero_si128(); |
|
3272 col[i32 + 23] = _mm_setzero_si128(); |
|
3273 col[i32 + 24] = _mm_setzero_si128(); |
|
3274 col[i32 + 25] = _mm_setzero_si128(); |
|
3275 col[i32 + 26] = _mm_setzero_si128(); |
|
3276 col[i32 + 27] = _mm_setzero_si128(); |
|
3277 col[i32 + 28] = _mm_setzero_si128(); |
|
3278 col[i32 + 29] = _mm_setzero_si128(); |
|
3279 col[i32 + 30] = _mm_setzero_si128(); |
|
3280 col[i32 + 31] = _mm_setzero_si128(); |
|
3281 continue; |
|
3282 } else { |
|
3283 // Second 1-D idct |
|
3284 j = i - 4; |
|
3285 |
|
3286 // Transpose 32x8 block to 8x32 block |
|
3287 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3288 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3289 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, |
|
3290 in5, in6, in7); |
|
3291 j += 4; |
|
3292 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3293 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3294 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, |
|
3295 in11, in12, in13, in14, in15); |
|
3296 j += 4; |
|
3297 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3298 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3299 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, |
|
3300 in19, in20, in21, in22, in23); |
|
3301 j += 4; |
|
3302 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3303 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3304 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, |
|
3305 in28, in29, in30, in31); |
|
3306 } |
|
3307 |
|
3308 IDCT32_1D |
|
3309 |
|
3310 // final stage |
|
3311 if (i < 4) { |
|
3312 // 1_D: Store 32 intermediate results for each 8x32 block. |
|
3313 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
|
3314 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
|
3315 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
|
3316 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
|
3317 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
|
3318 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
|
3319 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
|
3320 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
|
3321 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
|
3322 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
|
3323 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
|
3324 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
|
3325 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
|
3326 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
|
3327 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
|
3328 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
|
3329 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
|
3330 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
|
3331 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
|
3332 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
|
3333 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
|
3334 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
|
3335 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
|
3336 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
|
3337 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
|
3338 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
|
3339 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
|
3340 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
|
3341 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
|
3342 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
|
3343 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
|
3344 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
|
3345 } else { |
|
3346 const __m128i zero = _mm_setzero_si128(); |
|
3347 |
|
3348 // 2_D: Calculate the results and store them to destination. |
|
3349 in0 = _mm_add_epi16(stp1_0, stp1_31); |
|
3350 in1 = _mm_add_epi16(stp1_1, stp1_30); |
|
3351 in2 = _mm_add_epi16(stp1_2, stp1_29); |
|
3352 in3 = _mm_add_epi16(stp1_3, stp1_28); |
|
3353 in4 = _mm_add_epi16(stp1_4, stp1_27); |
|
3354 in5 = _mm_add_epi16(stp1_5, stp1_26); |
|
3355 in6 = _mm_add_epi16(stp1_6, stp1_25); |
|
3356 in7 = _mm_add_epi16(stp1_7, stp1_24); |
|
3357 in8 = _mm_add_epi16(stp1_8, stp1_23); |
|
3358 in9 = _mm_add_epi16(stp1_9, stp1_22); |
|
3359 in10 = _mm_add_epi16(stp1_10, stp1_21); |
|
3360 in11 = _mm_add_epi16(stp1_11, stp1_20); |
|
3361 in12 = _mm_add_epi16(stp1_12, stp1_19); |
|
3362 in13 = _mm_add_epi16(stp1_13, stp1_18); |
|
3363 in14 = _mm_add_epi16(stp1_14, stp1_17); |
|
3364 in15 = _mm_add_epi16(stp1_15, stp1_16); |
|
3365 in16 = _mm_sub_epi16(stp1_15, stp1_16); |
|
3366 in17 = _mm_sub_epi16(stp1_14, stp1_17); |
|
3367 in18 = _mm_sub_epi16(stp1_13, stp1_18); |
|
3368 in19 = _mm_sub_epi16(stp1_12, stp1_19); |
|
3369 in20 = _mm_sub_epi16(stp1_11, stp1_20); |
|
3370 in21 = _mm_sub_epi16(stp1_10, stp1_21); |
|
3371 in22 = _mm_sub_epi16(stp1_9, stp1_22); |
|
3372 in23 = _mm_sub_epi16(stp1_8, stp1_23); |
|
3373 in24 = _mm_sub_epi16(stp1_7, stp1_24); |
|
3374 in25 = _mm_sub_epi16(stp1_6, stp1_25); |
|
3375 in26 = _mm_sub_epi16(stp1_5, stp1_26); |
|
3376 in27 = _mm_sub_epi16(stp1_4, stp1_27); |
|
3377 in28 = _mm_sub_epi16(stp1_3, stp1_28); |
|
3378 in29 = _mm_sub_epi16(stp1_2, stp1_29); |
|
3379 in30 = _mm_sub_epi16(stp1_1, stp1_30); |
|
3380 in31 = _mm_sub_epi16(stp1_0, stp1_31); |
|
3381 |
|
3382 // Final rounding and shift |
|
3383 in0 = _mm_adds_epi16(in0, final_rounding); |
|
3384 in1 = _mm_adds_epi16(in1, final_rounding); |
|
3385 in2 = _mm_adds_epi16(in2, final_rounding); |
|
3386 in3 = _mm_adds_epi16(in3, final_rounding); |
|
3387 in4 = _mm_adds_epi16(in4, final_rounding); |
|
3388 in5 = _mm_adds_epi16(in5, final_rounding); |
|
3389 in6 = _mm_adds_epi16(in6, final_rounding); |
|
3390 in7 = _mm_adds_epi16(in7, final_rounding); |
|
3391 in8 = _mm_adds_epi16(in8, final_rounding); |
|
3392 in9 = _mm_adds_epi16(in9, final_rounding); |
|
3393 in10 = _mm_adds_epi16(in10, final_rounding); |
|
3394 in11 = _mm_adds_epi16(in11, final_rounding); |
|
3395 in12 = _mm_adds_epi16(in12, final_rounding); |
|
3396 in13 = _mm_adds_epi16(in13, final_rounding); |
|
3397 in14 = _mm_adds_epi16(in14, final_rounding); |
|
3398 in15 = _mm_adds_epi16(in15, final_rounding); |
|
3399 in16 = _mm_adds_epi16(in16, final_rounding); |
|
3400 in17 = _mm_adds_epi16(in17, final_rounding); |
|
3401 in18 = _mm_adds_epi16(in18, final_rounding); |
|
3402 in19 = _mm_adds_epi16(in19, final_rounding); |
|
3403 in20 = _mm_adds_epi16(in20, final_rounding); |
|
3404 in21 = _mm_adds_epi16(in21, final_rounding); |
|
3405 in22 = _mm_adds_epi16(in22, final_rounding); |
|
3406 in23 = _mm_adds_epi16(in23, final_rounding); |
|
3407 in24 = _mm_adds_epi16(in24, final_rounding); |
|
3408 in25 = _mm_adds_epi16(in25, final_rounding); |
|
3409 in26 = _mm_adds_epi16(in26, final_rounding); |
|
3410 in27 = _mm_adds_epi16(in27, final_rounding); |
|
3411 in28 = _mm_adds_epi16(in28, final_rounding); |
|
3412 in29 = _mm_adds_epi16(in29, final_rounding); |
|
3413 in30 = _mm_adds_epi16(in30, final_rounding); |
|
3414 in31 = _mm_adds_epi16(in31, final_rounding); |
|
3415 |
|
3416 in0 = _mm_srai_epi16(in0, 6); |
|
3417 in1 = _mm_srai_epi16(in1, 6); |
|
3418 in2 = _mm_srai_epi16(in2, 6); |
|
3419 in3 = _mm_srai_epi16(in3, 6); |
|
3420 in4 = _mm_srai_epi16(in4, 6); |
|
3421 in5 = _mm_srai_epi16(in5, 6); |
|
3422 in6 = _mm_srai_epi16(in6, 6); |
|
3423 in7 = _mm_srai_epi16(in7, 6); |
|
3424 in8 = _mm_srai_epi16(in8, 6); |
|
3425 in9 = _mm_srai_epi16(in9, 6); |
|
3426 in10 = _mm_srai_epi16(in10, 6); |
|
3427 in11 = _mm_srai_epi16(in11, 6); |
|
3428 in12 = _mm_srai_epi16(in12, 6); |
|
3429 in13 = _mm_srai_epi16(in13, 6); |
|
3430 in14 = _mm_srai_epi16(in14, 6); |
|
3431 in15 = _mm_srai_epi16(in15, 6); |
|
3432 in16 = _mm_srai_epi16(in16, 6); |
|
3433 in17 = _mm_srai_epi16(in17, 6); |
|
3434 in18 = _mm_srai_epi16(in18, 6); |
|
3435 in19 = _mm_srai_epi16(in19, 6); |
|
3436 in20 = _mm_srai_epi16(in20, 6); |
|
3437 in21 = _mm_srai_epi16(in21, 6); |
|
3438 in22 = _mm_srai_epi16(in22, 6); |
|
3439 in23 = _mm_srai_epi16(in23, 6); |
|
3440 in24 = _mm_srai_epi16(in24, 6); |
|
3441 in25 = _mm_srai_epi16(in25, 6); |
|
3442 in26 = _mm_srai_epi16(in26, 6); |
|
3443 in27 = _mm_srai_epi16(in27, 6); |
|
3444 in28 = _mm_srai_epi16(in28, 6); |
|
3445 in29 = _mm_srai_epi16(in29, 6); |
|
3446 in30 = _mm_srai_epi16(in30, 6); |
|
3447 in31 = _mm_srai_epi16(in31, 6); |
|
3448 |
|
3449 RECON_AND_STORE(dest, in0); |
|
3450 RECON_AND_STORE(dest, in1); |
|
3451 RECON_AND_STORE(dest, in2); |
|
3452 RECON_AND_STORE(dest, in3); |
|
3453 RECON_AND_STORE(dest, in4); |
|
3454 RECON_AND_STORE(dest, in5); |
|
3455 RECON_AND_STORE(dest, in6); |
|
3456 RECON_AND_STORE(dest, in7); |
|
3457 RECON_AND_STORE(dest, in8); |
|
3458 RECON_AND_STORE(dest, in9); |
|
3459 RECON_AND_STORE(dest, in10); |
|
3460 RECON_AND_STORE(dest, in11); |
|
3461 RECON_AND_STORE(dest, in12); |
|
3462 RECON_AND_STORE(dest, in13); |
|
3463 RECON_AND_STORE(dest, in14); |
|
3464 RECON_AND_STORE(dest, in15); |
|
3465 RECON_AND_STORE(dest, in16); |
|
3466 RECON_AND_STORE(dest, in17); |
|
3467 RECON_AND_STORE(dest, in18); |
|
3468 RECON_AND_STORE(dest, in19); |
|
3469 RECON_AND_STORE(dest, in20); |
|
3470 RECON_AND_STORE(dest, in21); |
|
3471 RECON_AND_STORE(dest, in22); |
|
3472 RECON_AND_STORE(dest, in23); |
|
3473 RECON_AND_STORE(dest, in24); |
|
3474 RECON_AND_STORE(dest, in25); |
|
3475 RECON_AND_STORE(dest, in26); |
|
3476 RECON_AND_STORE(dest, in27); |
|
3477 RECON_AND_STORE(dest, in28); |
|
3478 RECON_AND_STORE(dest, in29); |
|
3479 RECON_AND_STORE(dest, in30); |
|
3480 RECON_AND_STORE(dest, in31); |
|
3481 |
|
3482 dest += 8 - (stride * 32); |
|
3483 } |
|
3484 } |
|
3485 } |
|
3486 |
|
3487 void vp9_idct32x32_1024_add_sse2(const int16_t *input, uint8_t *dest, |
|
3488 int stride) { |
|
3489 const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); |
|
3490 const __m128i final_rounding = _mm_set1_epi16(1<<5); |
|
3491 |
|
3492 // idct constants for each stage |
|
3493 const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); |
|
3494 const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); |
|
3495 const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); |
|
3496 const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); |
|
3497 const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); |
|
3498 const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); |
|
3499 const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); |
|
3500 const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); |
|
3501 const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); |
|
3502 const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); |
|
3503 const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); |
|
3504 const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); |
|
3505 const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); |
|
3506 const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); |
|
3507 const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); |
|
3508 const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); |
|
3509 |
|
3510 const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); |
|
3511 const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); |
|
3512 const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); |
|
3513 const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); |
|
3514 const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); |
|
3515 const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); |
|
3516 const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); |
|
3517 const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); |
|
3518 |
|
3519 const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); |
|
3520 const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); |
|
3521 const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); |
|
3522 const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); |
|
3523 const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); |
|
3524 const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); |
|
3525 const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); |
|
3526 const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); |
|
3527 const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); |
|
3528 const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); |
|
3529 |
|
3530 const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); |
|
3531 const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); |
|
3532 const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); |
|
3533 const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); |
|
3534 const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); |
|
3535 const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); |
|
3536 const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); |
|
3537 |
|
3538 const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); |
|
3539 |
|
3540 __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, |
|
3541 in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23, |
|
3542 in24, in25, in26, in27, in28, in29, in30, in31; |
|
3543 __m128i col[128]; |
|
3544 __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, |
|
3545 stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, |
|
3546 stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, |
|
3547 stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, |
|
3548 stp1_30, stp1_31; |
|
3549 __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, |
|
3550 stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, |
|
3551 stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, |
|
3552 stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, |
|
3553 stp2_30, stp2_31; |
|
3554 __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; |
|
3555 int i, j, i32; |
|
3556 __m128i zero_idx[16]; |
|
3557 int zero_flag[2]; |
|
3558 |
|
3559 // We work on a 8x32 block each time, and loop 8 times for 2-D 32x32 idct. |
|
3560 for (i = 0; i < 8; i++) { |
|
3561 i32 = (i << 5); |
|
3562 if (i < 4) { |
|
3563 // First 1-D idct |
|
3564 // Load input data. |
|
3565 LOAD_DQCOEFF(in0, input); |
|
3566 LOAD_DQCOEFF(in8, input); |
|
3567 LOAD_DQCOEFF(in16, input); |
|
3568 LOAD_DQCOEFF(in24, input); |
|
3569 LOAD_DQCOEFF(in1, input); |
|
3570 LOAD_DQCOEFF(in9, input); |
|
3571 LOAD_DQCOEFF(in17, input); |
|
3572 LOAD_DQCOEFF(in25, input); |
|
3573 LOAD_DQCOEFF(in2, input); |
|
3574 LOAD_DQCOEFF(in10, input); |
|
3575 LOAD_DQCOEFF(in18, input); |
|
3576 LOAD_DQCOEFF(in26, input); |
|
3577 LOAD_DQCOEFF(in3, input); |
|
3578 LOAD_DQCOEFF(in11, input); |
|
3579 LOAD_DQCOEFF(in19, input); |
|
3580 LOAD_DQCOEFF(in27, input); |
|
3581 |
|
3582 LOAD_DQCOEFF(in4, input); |
|
3583 LOAD_DQCOEFF(in12, input); |
|
3584 LOAD_DQCOEFF(in20, input); |
|
3585 LOAD_DQCOEFF(in28, input); |
|
3586 LOAD_DQCOEFF(in5, input); |
|
3587 LOAD_DQCOEFF(in13, input); |
|
3588 LOAD_DQCOEFF(in21, input); |
|
3589 LOAD_DQCOEFF(in29, input); |
|
3590 LOAD_DQCOEFF(in6, input); |
|
3591 LOAD_DQCOEFF(in14, input); |
|
3592 LOAD_DQCOEFF(in22, input); |
|
3593 LOAD_DQCOEFF(in30, input); |
|
3594 LOAD_DQCOEFF(in7, input); |
|
3595 LOAD_DQCOEFF(in15, input); |
|
3596 LOAD_DQCOEFF(in23, input); |
|
3597 LOAD_DQCOEFF(in31, input); |
|
3598 |
|
3599 // checking if all entries are zero |
|
3600 zero_idx[0] = _mm_or_si128(in0, in1); |
|
3601 zero_idx[1] = _mm_or_si128(in2, in3); |
|
3602 zero_idx[2] = _mm_or_si128(in4, in5); |
|
3603 zero_idx[3] = _mm_or_si128(in6, in7); |
|
3604 zero_idx[4] = _mm_or_si128(in8, in9); |
|
3605 zero_idx[5] = _mm_or_si128(in10, in11); |
|
3606 zero_idx[6] = _mm_or_si128(in12, in13); |
|
3607 zero_idx[7] = _mm_or_si128(in14, in15); |
|
3608 zero_idx[8] = _mm_or_si128(in16, in17); |
|
3609 zero_idx[9] = _mm_or_si128(in18, in19); |
|
3610 zero_idx[10] = _mm_or_si128(in20, in21); |
|
3611 zero_idx[11] = _mm_or_si128(in22, in23); |
|
3612 zero_idx[12] = _mm_or_si128(in24, in25); |
|
3613 zero_idx[13] = _mm_or_si128(in26, in27); |
|
3614 zero_idx[14] = _mm_or_si128(in28, in29); |
|
3615 zero_idx[15] = _mm_or_si128(in30, in31); |
|
3616 |
|
3617 zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
|
3618 zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
|
3619 zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
|
3620 zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
|
3621 zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
|
3622 zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
|
3623 zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
|
3624 zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); |
|
3625 |
|
3626 zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); |
|
3627 zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); |
|
3628 zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); |
|
3629 zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); |
|
3630 zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); |
|
3631 zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); |
|
3632 zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); |
|
3633 |
|
3634 zero_idx[0] = _mm_unpackhi_epi64(zero_idx[14], zero_idx[14]); |
|
3635 zero_idx[1] = _mm_or_si128(zero_idx[0], zero_idx[14]); |
|
3636 zero_idx[2] = _mm_srli_epi64(zero_idx[1], 32); |
|
3637 zero_flag[0] = _mm_cvtsi128_si32(zero_idx[1]); |
|
3638 zero_flag[1] = _mm_cvtsi128_si32(zero_idx[2]); |
|
3639 |
|
3640 if (!zero_flag[0] && !zero_flag[1]) { |
|
3641 col[i32 + 0] = _mm_setzero_si128(); |
|
3642 col[i32 + 1] = _mm_setzero_si128(); |
|
3643 col[i32 + 2] = _mm_setzero_si128(); |
|
3644 col[i32 + 3] = _mm_setzero_si128(); |
|
3645 col[i32 + 4] = _mm_setzero_si128(); |
|
3646 col[i32 + 5] = _mm_setzero_si128(); |
|
3647 col[i32 + 6] = _mm_setzero_si128(); |
|
3648 col[i32 + 7] = _mm_setzero_si128(); |
|
3649 col[i32 + 8] = _mm_setzero_si128(); |
|
3650 col[i32 + 9] = _mm_setzero_si128(); |
|
3651 col[i32 + 10] = _mm_setzero_si128(); |
|
3652 col[i32 + 11] = _mm_setzero_si128(); |
|
3653 col[i32 + 12] = _mm_setzero_si128(); |
|
3654 col[i32 + 13] = _mm_setzero_si128(); |
|
3655 col[i32 + 14] = _mm_setzero_si128(); |
|
3656 col[i32 + 15] = _mm_setzero_si128(); |
|
3657 col[i32 + 16] = _mm_setzero_si128(); |
|
3658 col[i32 + 17] = _mm_setzero_si128(); |
|
3659 col[i32 + 18] = _mm_setzero_si128(); |
|
3660 col[i32 + 19] = _mm_setzero_si128(); |
|
3661 col[i32 + 20] = _mm_setzero_si128(); |
|
3662 col[i32 + 21] = _mm_setzero_si128(); |
|
3663 col[i32 + 22] = _mm_setzero_si128(); |
|
3664 col[i32 + 23] = _mm_setzero_si128(); |
|
3665 col[i32 + 24] = _mm_setzero_si128(); |
|
3666 col[i32 + 25] = _mm_setzero_si128(); |
|
3667 col[i32 + 26] = _mm_setzero_si128(); |
|
3668 col[i32 + 27] = _mm_setzero_si128(); |
|
3669 col[i32 + 28] = _mm_setzero_si128(); |
|
3670 col[i32 + 29] = _mm_setzero_si128(); |
|
3671 col[i32 + 30] = _mm_setzero_si128(); |
|
3672 col[i32 + 31] = _mm_setzero_si128(); |
|
3673 continue; |
|
3674 } |
|
3675 |
|
3676 // Transpose 32x8 block to 8x32 block |
|
3677 TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, |
|
3678 in4, in5, in6, in7); |
|
3679 TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, |
|
3680 in10, in11, in12, in13, in14, in15); |
|
3681 TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17, |
|
3682 in18, in19, in20, in21, in22, in23); |
|
3683 TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25, |
|
3684 in26, in27, in28, in29, in30, in31); |
|
3685 } else { |
|
3686 // Second 1-D idct |
|
3687 j = i - 4; |
|
3688 |
|
3689 // Transpose 32x8 block to 8x32 block |
|
3690 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3691 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3692 col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4, |
|
3693 in5, in6, in7); |
|
3694 j += 4; |
|
3695 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3696 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3697 col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10, |
|
3698 in11, in12, in13, in14, in15); |
|
3699 j += 4; |
|
3700 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3701 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3702 col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18, |
|
3703 in19, in20, in21, in22, in23); |
|
3704 j += 4; |
|
3705 TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2], |
|
3706 col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5], |
|
3707 col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27, |
|
3708 in28, in29, in30, in31); |
|
3709 } |
|
3710 |
|
3711 IDCT32_1D |
|
3712 |
|
3713 // final stage |
|
3714 if (i < 4) { |
|
3715 // 1_D: Store 32 intermediate results for each 8x32 block. |
|
3716 col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); |
|
3717 col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); |
|
3718 col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); |
|
3719 col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); |
|
3720 col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); |
|
3721 col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); |
|
3722 col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); |
|
3723 col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); |
|
3724 col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); |
|
3725 col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); |
|
3726 col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); |
|
3727 col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); |
|
3728 col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); |
|
3729 col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); |
|
3730 col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); |
|
3731 col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); |
|
3732 col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); |
|
3733 col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); |
|
3734 col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); |
|
3735 col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); |
|
3736 col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); |
|
3737 col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); |
|
3738 col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); |
|
3739 col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); |
|
3740 col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); |
|
3741 col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); |
|
3742 col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); |
|
3743 col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); |
|
3744 col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); |
|
3745 col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); |
|
3746 col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); |
|
3747 col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); |
|
    } else {
      const __m128i zero = _mm_setzero_si128();

      // 2-D: Calculate the results and store them to destination.
      in0 = _mm_add_epi16(stp1_0, stp1_31);
      in1 = _mm_add_epi16(stp1_1, stp1_30);
      in2 = _mm_add_epi16(stp1_2, stp1_29);
      in3 = _mm_add_epi16(stp1_3, stp1_28);
      in4 = _mm_add_epi16(stp1_4, stp1_27);
      in5 = _mm_add_epi16(stp1_5, stp1_26);
      in6 = _mm_add_epi16(stp1_6, stp1_25);
      in7 = _mm_add_epi16(stp1_7, stp1_24);
      in8 = _mm_add_epi16(stp1_8, stp1_23);
      in9 = _mm_add_epi16(stp1_9, stp1_22);
      in10 = _mm_add_epi16(stp1_10, stp1_21);
      in11 = _mm_add_epi16(stp1_11, stp1_20);
      in12 = _mm_add_epi16(stp1_12, stp1_19);
      in13 = _mm_add_epi16(stp1_13, stp1_18);
      in14 = _mm_add_epi16(stp1_14, stp1_17);
      in15 = _mm_add_epi16(stp1_15, stp1_16);
      in16 = _mm_sub_epi16(stp1_15, stp1_16);
      in17 = _mm_sub_epi16(stp1_14, stp1_17);
      in18 = _mm_sub_epi16(stp1_13, stp1_18);
      in19 = _mm_sub_epi16(stp1_12, stp1_19);
      in20 = _mm_sub_epi16(stp1_11, stp1_20);
      in21 = _mm_sub_epi16(stp1_10, stp1_21);
      in22 = _mm_sub_epi16(stp1_9, stp1_22);
      in23 = _mm_sub_epi16(stp1_8, stp1_23);
      in24 = _mm_sub_epi16(stp1_7, stp1_24);
      in25 = _mm_sub_epi16(stp1_6, stp1_25);
      in26 = _mm_sub_epi16(stp1_5, stp1_26);
      in27 = _mm_sub_epi16(stp1_4, stp1_27);
      in28 = _mm_sub_epi16(stp1_3, stp1_28);
      in29 = _mm_sub_epi16(stp1_2, stp1_29);
      in30 = _mm_sub_epi16(stp1_1, stp1_30);
      in31 = _mm_sub_epi16(stp1_0, stp1_31);

      // Final rounding and shift
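      // Together these implement ROUND_POWER_OF_TWO(x, 6): add the 1 << 5
      // bias held in final_rounding (declared earlier in this function), then
      // arithmetic-shift right by 6 to remove the scaling of the two passes.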
      in0 = _mm_adds_epi16(in0, final_rounding);
      in1 = _mm_adds_epi16(in1, final_rounding);
      in2 = _mm_adds_epi16(in2, final_rounding);
      in3 = _mm_adds_epi16(in3, final_rounding);
      in4 = _mm_adds_epi16(in4, final_rounding);
      in5 = _mm_adds_epi16(in5, final_rounding);
      in6 = _mm_adds_epi16(in6, final_rounding);
      in7 = _mm_adds_epi16(in7, final_rounding);
      in8 = _mm_adds_epi16(in8, final_rounding);
      in9 = _mm_adds_epi16(in9, final_rounding);
      in10 = _mm_adds_epi16(in10, final_rounding);
      in11 = _mm_adds_epi16(in11, final_rounding);
      in12 = _mm_adds_epi16(in12, final_rounding);
      in13 = _mm_adds_epi16(in13, final_rounding);
      in14 = _mm_adds_epi16(in14, final_rounding);
      in15 = _mm_adds_epi16(in15, final_rounding);
      in16 = _mm_adds_epi16(in16, final_rounding);
      in17 = _mm_adds_epi16(in17, final_rounding);
      in18 = _mm_adds_epi16(in18, final_rounding);
      in19 = _mm_adds_epi16(in19, final_rounding);
      in20 = _mm_adds_epi16(in20, final_rounding);
      in21 = _mm_adds_epi16(in21, final_rounding);
      in22 = _mm_adds_epi16(in22, final_rounding);
      in23 = _mm_adds_epi16(in23, final_rounding);
      in24 = _mm_adds_epi16(in24, final_rounding);
      in25 = _mm_adds_epi16(in25, final_rounding);
      in26 = _mm_adds_epi16(in26, final_rounding);
      in27 = _mm_adds_epi16(in27, final_rounding);
      in28 = _mm_adds_epi16(in28, final_rounding);
      in29 = _mm_adds_epi16(in29, final_rounding);
      in30 = _mm_adds_epi16(in30, final_rounding);
      in31 = _mm_adds_epi16(in31, final_rounding);

      in0 = _mm_srai_epi16(in0, 6);
      in1 = _mm_srai_epi16(in1, 6);
      in2 = _mm_srai_epi16(in2, 6);
      in3 = _mm_srai_epi16(in3, 6);
      in4 = _mm_srai_epi16(in4, 6);
      in5 = _mm_srai_epi16(in5, 6);
      in6 = _mm_srai_epi16(in6, 6);
      in7 = _mm_srai_epi16(in7, 6);
      in8 = _mm_srai_epi16(in8, 6);
      in9 = _mm_srai_epi16(in9, 6);
      in10 = _mm_srai_epi16(in10, 6);
      in11 = _mm_srai_epi16(in11, 6);
      in12 = _mm_srai_epi16(in12, 6);
      in13 = _mm_srai_epi16(in13, 6);
      in14 = _mm_srai_epi16(in14, 6);
      in15 = _mm_srai_epi16(in15, 6);
      in16 = _mm_srai_epi16(in16, 6);
      in17 = _mm_srai_epi16(in17, 6);
      in18 = _mm_srai_epi16(in18, 6);
      in19 = _mm_srai_epi16(in19, 6);
      in20 = _mm_srai_epi16(in20, 6);
      in21 = _mm_srai_epi16(in21, 6);
      in22 = _mm_srai_epi16(in22, 6);
      in23 = _mm_srai_epi16(in23, 6);
      in24 = _mm_srai_epi16(in24, 6);
      in25 = _mm_srai_epi16(in25, 6);
      in26 = _mm_srai_epi16(in26, 6);
      in27 = _mm_srai_epi16(in27, 6);
      in28 = _mm_srai_epi16(in28, 6);
      in29 = _mm_srai_epi16(in29, 6);
      in30 = _mm_srai_epi16(in30, 6);
      in31 = _mm_srai_epi16(in31, 6);
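
      // RECON_AND_STORE (defined earlier in this file) adds an 8-wide row of
      // residuals to the destination pixels, saturates to 8 bits, stores the
      // row, and advances dest by one stride.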
      RECON_AND_STORE(dest, in0);
      RECON_AND_STORE(dest, in1);
      RECON_AND_STORE(dest, in2);
      RECON_AND_STORE(dest, in3);
      RECON_AND_STORE(dest, in4);
      RECON_AND_STORE(dest, in5);
      RECON_AND_STORE(dest, in6);
      RECON_AND_STORE(dest, in7);
      RECON_AND_STORE(dest, in8);
      RECON_AND_STORE(dest, in9);
      RECON_AND_STORE(dest, in10);
      RECON_AND_STORE(dest, in11);
      RECON_AND_STORE(dest, in12);
      RECON_AND_STORE(dest, in13);
      RECON_AND_STORE(dest, in14);
      RECON_AND_STORE(dest, in15);
      RECON_AND_STORE(dest, in16);
      RECON_AND_STORE(dest, in17);
      RECON_AND_STORE(dest, in18);
      RECON_AND_STORE(dest, in19);
      RECON_AND_STORE(dest, in20);
      RECON_AND_STORE(dest, in21);
      RECON_AND_STORE(dest, in22);
      RECON_AND_STORE(dest, in23);
      RECON_AND_STORE(dest, in24);
      RECON_AND_STORE(dest, in25);
      RECON_AND_STORE(dest, in26);
      RECON_AND_STORE(dest, in27);
      RECON_AND_STORE(dest, in28);
      RECON_AND_STORE(dest, in29);
      RECON_AND_STORE(dest, in30);
      RECON_AND_STORE(dest, in31);
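
      // Each RECON_AND_STORE advanced dest by one row; step back up 32 rows
      // and move right by 8 pixels to the next 8-column group.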
      dest += 8 - (stride * 32);
    }
  }
}  //NOLINT

void vp9_idct32x32_1_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  __m128i dc_value;
  const __m128i zero = _mm_setzero_si128();
  int a, i;

  a = dct_const_round_shift(input[0] * cospi_16_64);
  a = dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);
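
  // For a DC-only block the 2-D inverse transform reduces to scaling the DC
  // coefficient by cospi_16_64 once per pass (dct_const_round_shift removes
  // the fixed-point scale of the constant), followed by the same
  // ROUND_POWER_OF_TWO(., 6) as the full transform; the resulting constant is
  // added to every destination pixel below.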

  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 4; ++i) {
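    // Each iteration covers one 8-pixel-wide strip of the 32x32 block: 32
    // rows of dc_value are added to the destination, then dest is rewound to
    // the top of the next strip.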
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    RECON_AND_STORE(dest, dc_value);
    dest += 8 - (stride * 32);
  }
}