|
1 /* |
|
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 * |
|
4 * Use of this source code is governed by a BSD-style license |
|
5 * that can be found in the LICENSE file in the root of the source |
|
6 * tree. An additional intellectual property rights grant can be found |
|
7 * in the file PATENTS. All contributing project authors may |
|
8 * be found in the AUTHORS file in the root of the source tree. |
|
9 */ |
|
10 |
|
11 #include <assert.h> |
|
12 #include <math.h> |
|
13 |
|
14 #include "./vpx_config.h" |
|
15 #include "./vp9_rtcd.h" |
|
16 #include "vp9/common/vp9_systemdependent.h" |
|
17 #include "vp9/common/vp9_blockd.h" |
|
18 #include "vp9/common/vp9_common.h" |
|
19 #include "vp9/common/vp9_idct.h" |
|
20 |
|
21 void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { |
|
22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, |
|
23 0.5 shifts per pixel. */ |
|
24 int i; |
|
25 int16_t output[16]; |
|
26 int a1, b1, c1, d1, e1; |
|
27 const int16_t *ip = input; |
|
28 int16_t *op = output; |
|
29 |
|
30 for (i = 0; i < 4; i++) { |
|
31 a1 = ip[0] >> UNIT_QUANT_SHIFT; |
|
32 c1 = ip[1] >> UNIT_QUANT_SHIFT; |
|
33 d1 = ip[2] >> UNIT_QUANT_SHIFT; |
|
34 b1 = ip[3] >> UNIT_QUANT_SHIFT; |
|
35 a1 += c1; |
|
36 d1 -= b1; |
|
37 e1 = (a1 - d1) >> 1; |
|
38 b1 = e1 - b1; |
|
39 c1 = e1 - c1; |
|
40 a1 -= b1; |
|
41 d1 += c1; |
|
42 op[0] = a1; |
|
43 op[1] = b1; |
|
44 op[2] = c1; |
|
45 op[3] = d1; |
|
46 ip += 4; |
|
47 op += 4; |
|
48 } |
|
49 |
|
50 ip = output; |
|
51 for (i = 0; i < 4; i++) { |
|
52 a1 = ip[4 * 0]; |
|
53 c1 = ip[4 * 1]; |
|
54 d1 = ip[4 * 2]; |
|
55 b1 = ip[4 * 3]; |
|
56 a1 += c1; |
|
57 d1 -= b1; |
|
58 e1 = (a1 - d1) >> 1; |
|
59 b1 = e1 - b1; |
|
60 c1 = e1 - c1; |
|
61 a1 -= b1; |
|
62 d1 += c1; |
|
63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); |
|
64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); |
|
65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); |
|
66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); |
|
67 |
|
68 ip++; |
|
69 dest++; |
|
70 } |
|
71 } |
|
72 |
|
73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { |
|
74 int i; |
|
75 int a1, e1; |
|
76 int16_t tmp[4]; |
|
77 const int16_t *ip = in; |
|
78 int16_t *op = tmp; |
|
79 |
|
80 a1 = ip[0] >> UNIT_QUANT_SHIFT; |
|
81 e1 = a1 >> 1; |
|
82 a1 -= e1; |
|
83 op[0] = a1; |
|
84 op[1] = op[2] = op[3] = e1; |
|
85 |
|
86 ip = tmp; |
|
87 for (i = 0; i < 4; i++) { |
|
88 e1 = ip[0] >> 1; |
|
89 a1 = ip[0] - e1; |
|
90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); |
|
91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); |
|
92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); |
|
93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); |
|
94 ip++; |
|
95 dest++; |
|
96 } |
|
97 } |
|
98 |
|
99 static void idct4_1d(const int16_t *input, int16_t *output) { |
|
100 int16_t step[4]; |
|
101 int temp1, temp2; |
|
102 // stage 1 |
|
103 temp1 = (input[0] + input[2]) * cospi_16_64; |
|
104 temp2 = (input[0] - input[2]) * cospi_16_64; |
|
105 step[0] = dct_const_round_shift(temp1); |
|
106 step[1] = dct_const_round_shift(temp2); |
|
107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; |
|
108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; |
|
109 step[2] = dct_const_round_shift(temp1); |
|
110 step[3] = dct_const_round_shift(temp2); |
|
111 |
|
112 // stage 2 |
|
113 output[0] = step[0] + step[3]; |
|
114 output[1] = step[1] + step[2]; |
|
115 output[2] = step[1] - step[2]; |
|
116 output[3] = step[0] - step[3]; |
|
117 } |
|
118 |
|
void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // Full 4x4 inverse DCT: row transforms into a scratch buffer, then
  // column transforms with final rounding (>> 4) and a clipped add into
  // the destination.
  int16_t intermediate[4 * 4];
  int16_t col_in[4], col_out[4];
  int r, c;

  // Rows
  for (r = 0; r < 4; ++r)
    idct4_1d(input + 4 * r, intermediate + 4 * r);

  // Columns
  for (c = 0; c < 4; ++c) {
    for (r = 0; r < 4; ++r)
      col_in[r] = intermediate[r * 4 + c];
    idct4_1d(col_in, col_out);
    for (r = 0; r < 4; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 4)
                                        + dest[r * stride + c]);
  }
}
|
142 |
|
143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { |
|
144 int i; |
|
145 int a1; |
|
146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); |
|
147 out = dct_const_round_shift(out * cospi_16_64); |
|
148 a1 = ROUND_POWER_OF_TWO(out, 4); |
|
149 |
|
150 for (i = 0; i < 4; i++) { |
|
151 dest[0] = clip_pixel(dest[0] + a1); |
|
152 dest[1] = clip_pixel(dest[1] + a1); |
|
153 dest[2] = clip_pixel(dest[2] + a1); |
|
154 dest[3] = clip_pixel(dest[3] + a1); |
|
155 dest += dest_stride; |
|
156 } |
|
157 } |
|
158 |
|
static void idct8_1d(const int16_t *input, int16_t *output) {
  // One-dimensional 8-point inverse DCT butterfly.  The even-indexed
  // inputs are handled by reusing idct4_1d on a reordered copy; the
  // odd-indexed inputs go through the stage-1 rotations below.
  int16_t step1[8], step2[8];
  int temp1, temp2;
  // stage 1: gather even inputs (reordered for idct4_1d) and rotate the
  // odd inputs with the cospi_28/4 and cospi_12/20 pairs.
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  // stage 2 & stage 3 - even half (in-place 4-point inverse DCT)
  idct4_1d(step1, step1);

  // stage 2 - odd half
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  // stage 4: final butterfly combining the even and odd halves.
  output[0] = step1[0] + step1[7];
  output[1] = step1[1] + step1[6];
  output[2] = step1[2] + step1[5];
  output[3] = step1[3] + step1[4];
  output[4] = step1[3] - step1[4];
  output[5] = step1[2] - step1[5];
  output[6] = step1[1] - step1[6];
  output[7] = step1[0] - step1[7];
}
|
203 |
|
void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // Full 8x8 inverse DCT: row pass into a scratch buffer, then a column
  // pass with final rounding (>> 5) and a clipped add into dest.
  int16_t intermediate[8 * 8];
  int16_t col_in[8], col_out[8];
  int r, c;

  // First transform rows
  for (r = 0; r < 8; ++r)
    idct8_1d(input + 8 * r, intermediate + 8 * r);

  // Then transform columns
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = intermediate[r * 8 + c];
    idct8_1d(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
|
227 |
|
228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { |
|
229 int i, j; |
|
230 int a1; |
|
231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); |
|
232 out = dct_const_round_shift(out * cospi_16_64); |
|
233 a1 = ROUND_POWER_OF_TWO(out, 5); |
|
234 for (j = 0; j < 8; ++j) { |
|
235 for (i = 0; i < 8; ++i) |
|
236 dest[i] = clip_pixel(dest[i] + a1); |
|
237 dest += stride; |
|
238 } |
|
239 } |
|
240 |
|
static void iadst4_1d(const int16_t *input, int16_t *output) {
  // One-dimensional 4-point inverse ADST using sinpi_k_9 rotation
  // constants.
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[0];
  int x1 = input[1];
  int x2 = input[2];
  int x3 = input[3];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  x0 = s0 + s3 + s5;
  x1 = s1 - s4 - s6;
  x2 = sinpi_3_9 * s7;
  x3 = s2;

  s0 = x0 + x3;
  s1 = x1 + x3;
  s2 = x2;
  s3 = x0 + x1 - x3;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}
|
282 |
|
283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, |
|
284 int tx_type) { |
|
285 const transform_2d IHT_4[] = { |
|
286 { idct4_1d, idct4_1d }, // DCT_DCT = 0 |
|
287 { iadst4_1d, idct4_1d }, // ADST_DCT = 1 |
|
288 { idct4_1d, iadst4_1d }, // DCT_ADST = 2 |
|
289 { iadst4_1d, iadst4_1d } // ADST_ADST = 3 |
|
290 }; |
|
291 |
|
292 int i, j; |
|
293 int16_t out[4 * 4]; |
|
294 int16_t *outptr = out; |
|
295 int16_t temp_in[4], temp_out[4]; |
|
296 |
|
297 // inverse transform row vectors |
|
298 for (i = 0; i < 4; ++i) { |
|
299 IHT_4[tx_type].rows(input, outptr); |
|
300 input += 4; |
|
301 outptr += 4; |
|
302 } |
|
303 |
|
304 // inverse transform column vectors |
|
305 for (i = 0; i < 4; ++i) { |
|
306 for (j = 0; j < 4; ++j) |
|
307 temp_in[j] = out[j * 4 + i]; |
|
308 IHT_4[tx_type].cols(temp_in, temp_out); |
|
309 for (j = 0; j < 4; ++j) |
|
310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) |
|
311 + dest[j * stride + i]); |
|
312 } |
|
313 } |
|
static void iadst8_1d(const int16_t *input, int16_t *output) {
  // One-dimensional 8-point inverse ADST.  Inputs are read in a
  // permuted order and processed through three butterfly stages of
  // cospi rotations; the final outputs carry alternating sign flips.
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  // Output permutation with sign flips on the odd positions.
  output[0] = x0;
  output[1] = -x4;
  output[2] = x6;
  output[3] = -x2;
  output[4] = x3;
  output[5] = -x7;
  output[6] = x5;
  output[7] = -x1;
}
|
390 |
|
// 1-D row/column transform pairs for the 8x8 hybrid transform,
// indexed by tx_type.
static const transform_2d IHT_8[] = {
  { idct8_1d, idct8_1d },    // DCT_DCT = 0
  { iadst8_1d, idct8_1d },   // ADST_DCT = 1
  { idct8_1d, iadst8_1d },   // DCT_ADST = 2
  { iadst8_1d, iadst8_1d }   // ADST_ADST = 3
};
|
397 |
|
398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, |
|
399 int tx_type) { |
|
400 int i, j; |
|
401 int16_t out[8 * 8]; |
|
402 int16_t *outptr = out; |
|
403 int16_t temp_in[8], temp_out[8]; |
|
404 const transform_2d ht = IHT_8[tx_type]; |
|
405 |
|
406 // inverse transform row vectors |
|
407 for (i = 0; i < 8; ++i) { |
|
408 ht.rows(input, outptr); |
|
409 input += 8; |
|
410 outptr += 8; |
|
411 } |
|
412 |
|
413 // inverse transform column vectors |
|
414 for (i = 0; i < 8; ++i) { |
|
415 for (j = 0; j < 8; ++j) |
|
416 temp_in[j] = out[j * 8 + i]; |
|
417 ht.cols(temp_in, temp_out); |
|
418 for (j = 0; j < 8; ++j) |
|
419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) |
|
420 + dest[j * stride + i]); |
|
421 } |
|
422 } |
|
423 |
|
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // 8x8 inverse DCT specialized for sparse blocks: only the first 4
  // rows contain non-zero coefficients, so only those rows need a row
  // transform; the remaining rows of the scratch buffer stay zero.
  int16_t intermediate[8 * 8] = { 0 };
  int16_t col_in[8], col_out[8];
  int r, c;

  // Row pass over the first 4 rows only.
  for (r = 0; r < 4; ++r)
    idct8_1d(input + 8 * r, intermediate + 8 * r);

  // All 8 columns still require a full column transform.
  for (c = 0; c < 8; ++c) {
    for (r = 0; r < 8; ++r)
      col_in[r] = intermediate[r * 8 + c];
    idct8_1d(col_in, col_out);
    for (r = 0; r < 8; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 5)
                                        + dest[r * stride + c]);
  }
}
|
448 |
|
static void idct16_1d(const int16_t *input, int16_t *output) {
  // One-dimensional 16-point inverse DCT, written as a 7-stage
  // butterfly network alternating between step1 and step2 scratch
  // arrays.
  int16_t step1[16], step2[16];
  int temp1, temp2;

  // stage 1: bit-reverse-style load; the n/2 index forms are
  // compile-time constants.
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2: pass the even half through; rotate the odd half.
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = dct_const_round_shift(temp1);
  step2[15] = dct_const_round_shift(temp2);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);

  // stage 3: rotate indices 4..7; butterfly the odd half.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = dct_const_round_shift(temp1);
  step1[7] = dct_const_round_shift(temp2);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);

  step1[8] = step2[8] + step2[9];
  step1[9] = step2[8] - step2[9];
  step1[10] = -step2[10] + step2[11];
  step1[11] = step2[10] + step2[11];
  step1[12] = step2[12] + step2[13];
  step1[13] = step2[12] - step2[13];
  step1[14] = -step2[14] + step2[15];
  step1[15] = step2[14] + step2[15];

  // stage 4: even-half rotations and butterflies; odd-half rotations.
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = dct_const_round_shift(temp1);
  step2[1] = dct_const_round_shift(temp2);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  step2[4] = step1[4] + step1[5];
  step2[5] = step1[4] - step1[5];
  step2[6] = -step1[6] + step1[7];
  step2[7] = step1[6] + step1[7];

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = dct_const_round_shift(temp1);
  step2[14] = dct_const_round_shift(temp2);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = step2[0] + step2[3];
  step1[1] = step2[1] + step2[2];
  step1[2] = step2[1] - step2[2];
  step1[3] = step2[0] - step2[3];
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = dct_const_round_shift(temp1);
  step1[6] = dct_const_round_shift(temp2);
  step1[7] = step2[7];

  step1[8] = step2[8] + step2[11];
  step1[9] = step2[9] + step2[10];
  step1[10] = step2[9] - step2[10];
  step1[11] = step2[8] - step2[11];
  step1[12] = -step2[12] + step2[15];
  step1[13] = -step2[13] + step2[14];
  step1[14] = step2[13] + step2[14];
  step1[15] = step2[12] + step2[15];

  // stage 6
  step2[0] = step1[0] + step1[7];
  step2[1] = step1[1] + step1[6];
  step2[2] = step1[2] + step1[5];
  step2[3] = step1[3] + step1[4];
  step2[4] = step1[3] - step1[4];
  step2[5] = step1[2] - step1[5];
  step2[6] = step1[1] - step1[6];
  step2[7] = step1[0] - step1[7];
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = dct_const_round_shift(temp1);
  step2[13] = dct_const_round_shift(temp2);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = dct_const_round_shift(temp1);
  step2[12] = dct_const_round_shift(temp2);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final butterfly producing the 16 outputs.
  output[0] = step2[0] + step2[15];
  output[1] = step2[1] + step2[14];
  output[2] = step2[2] + step2[13];
  output[3] = step2[3] + step2[12];
  output[4] = step2[4] + step2[11];
  output[5] = step2[5] + step2[10];
  output[6] = step2[6] + step2[9];
  output[7] = step2[7] + step2[8];
  output[8] = step2[7] - step2[8];
  output[9] = step2[6] - step2[9];
  output[10] = step2[5] - step2[10];
  output[11] = step2[4] - step2[11];
  output[12] = step2[3] - step2[12];
  output[13] = step2[2] - step2[13];
  output[14] = step2[1] - step2[14];
  output[15] = step2[0] - step2[15];
}
|
613 |
|
void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // Full 16x16 inverse DCT: row pass into a scratch buffer, then a
  // column pass with final rounding (>> 6) and a clipped add into dest.
  int16_t intermediate[16 * 16];
  int16_t col_in[16], col_out[16];
  int r, c;

  // First transform rows
  for (r = 0; r < 16; ++r)
    idct16_1d(input + 16 * r, intermediate + 16 * r);

  // Then transform columns
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = intermediate[r * 16 + c];
    idct16_1d(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
|
637 |
|
static void iadst16_1d(const int16_t *input, int16_t *output) {
  // One-dimensional 16-point inverse ADST.  Inputs are read in a
  // permuted order and processed through four butterfly stages of
  // cospi rotations; the outputs are permuted with selective sign
  // flips.
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  // All-zero input maps to all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8 = dct_const_round_shift(s0 - s8);
  x9 = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4: final cospi_16_64 rotations.
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  // Output permutation with selective sign flips.
  output[0] = x0;
  output[1] = -x8;
  output[2] = x12;
  output[3] = -x4;
  output[4] = x6;
  output[5] = x14;
  output[6] = x10;
  output[7] = x2;
  output[8] = x3;
  output[9] = x11;
  output[10] = x15;
  output[11] = x7;
  output[12] = x5;
  output[13] = -x13;
  output[14] = x9;
  output[15] = -x1;
}
|
808 |
|
// 1-D row/column transform pairs for the 16x16 hybrid transform,
// indexed by tx_type.
static const transform_2d IHT_16[] = {
  { idct16_1d, idct16_1d },    // DCT_DCT = 0
  { iadst16_1d, idct16_1d },   // ADST_DCT = 1
  { idct16_1d, iadst16_1d },   // DCT_ADST = 2
  { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
};
|
815 |
|
816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, |
|
817 int tx_type) { |
|
818 int i, j; |
|
819 int16_t out[16 * 16]; |
|
820 int16_t *outptr = out; |
|
821 int16_t temp_in[16], temp_out[16]; |
|
822 const transform_2d ht = IHT_16[tx_type]; |
|
823 |
|
824 // Rows |
|
825 for (i = 0; i < 16; ++i) { |
|
826 ht.rows(input, outptr); |
|
827 input += 16; |
|
828 outptr += 16; |
|
829 } |
|
830 |
|
831 // Columns |
|
832 for (i = 0; i < 16; ++i) { |
|
833 for (j = 0; j < 16; ++j) |
|
834 temp_in[j] = out[j * 16 + i]; |
|
835 ht.cols(temp_in, temp_out); |
|
836 for (j = 0; j < 16; ++j) |
|
837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) |
|
838 + dest[j * stride + i]); |
|
839 } |
|
840 } |
|
841 |
|
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  // 16x16 inverse DCT specialized for sparse blocks: all non-zero
  // coefficients lie in the upper-left 4x4 area, so only the first 4
  // rows need a row transform; the rest of the scratch buffer stays
  // zero.
  int16_t intermediate[16 * 16] = { 0 };
  int16_t col_in[16], col_out[16];
  int r, c;

  // Row pass over the first 4 rows only.
  for (r = 0; r < 4; ++r)
    idct16_1d(input + 16 * r, intermediate + 16 * r);

  // All 16 columns still require a full column transform.
  for (c = 0; c < 16; ++c) {
    for (r = 0; r < 16; ++r)
      col_in[r] = intermediate[r * 16 + c];
    idct16_1d(col_in, col_out);
    for (r = 0; r < 16; ++r)
      dest[r * stride + c] = clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6)
                                        + dest[r * stride + c]);
  }
}
|
866 |
|
867 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { |
|
868 int i, j; |
|
869 int a1; |
|
870 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); |
|
871 out = dct_const_round_shift(out * cospi_16_64); |
|
872 a1 = ROUND_POWER_OF_TWO(out, 6); |
|
873 for (j = 0; j < 16; ++j) { |
|
874 for (i = 0; i < 16; ++i) |
|
875 dest[i] = clip_pixel(dest[i] + a1); |
|
876 dest += stride; |
|
877 } |
|
878 } |
|
879 |
|
880 static void idct32_1d(const int16_t *input, int16_t *output) { |
|
881 int16_t step1[32], step2[32]; |
|
882 int temp1, temp2; |
|
883 |
|
884 // stage 1 |
|
885 step1[0] = input[0]; |
|
886 step1[1] = input[16]; |
|
887 step1[2] = input[8]; |
|
888 step1[3] = input[24]; |
|
889 step1[4] = input[4]; |
|
890 step1[5] = input[20]; |
|
891 step1[6] = input[12]; |
|
892 step1[7] = input[28]; |
|
893 step1[8] = input[2]; |
|
894 step1[9] = input[18]; |
|
895 step1[10] = input[10]; |
|
896 step1[11] = input[26]; |
|
897 step1[12] = input[6]; |
|
898 step1[13] = input[22]; |
|
899 step1[14] = input[14]; |
|
900 step1[15] = input[30]; |
|
901 |
|
902 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; |
|
903 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; |
|
904 step1[16] = dct_const_round_shift(temp1); |
|
905 step1[31] = dct_const_round_shift(temp2); |
|
906 |
|
907 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; |
|
908 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; |
|
909 step1[17] = dct_const_round_shift(temp1); |
|
910 step1[30] = dct_const_round_shift(temp2); |
|
911 |
|
912 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; |
|
913 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; |
|
914 step1[18] = dct_const_round_shift(temp1); |
|
915 step1[29] = dct_const_round_shift(temp2); |
|
916 |
|
917 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; |
|
918 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; |
|
919 step1[19] = dct_const_round_shift(temp1); |
|
920 step1[28] = dct_const_round_shift(temp2); |
|
921 |
|
922 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; |
|
923 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; |
|
924 step1[20] = dct_const_round_shift(temp1); |
|
925 step1[27] = dct_const_round_shift(temp2); |
|
926 |
|
927 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; |
|
928 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; |
|
929 step1[21] = dct_const_round_shift(temp1); |
|
930 step1[26] = dct_const_round_shift(temp2); |
|
931 |
|
932 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; |
|
933 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; |
|
934 step1[22] = dct_const_round_shift(temp1); |
|
935 step1[25] = dct_const_round_shift(temp2); |
|
936 |
|
937 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; |
|
938 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; |
|
939 step1[23] = dct_const_round_shift(temp1); |
|
940 step1[24] = dct_const_round_shift(temp2); |
|
941 |
|
942 // stage 2 |
|
943 step2[0] = step1[0]; |
|
944 step2[1] = step1[1]; |
|
945 step2[2] = step1[2]; |
|
946 step2[3] = step1[3]; |
|
947 step2[4] = step1[4]; |
|
948 step2[5] = step1[5]; |
|
949 step2[6] = step1[6]; |
|
950 step2[7] = step1[7]; |
|
951 |
|
952 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; |
|
953 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; |
|
954 step2[8] = dct_const_round_shift(temp1); |
|
955 step2[15] = dct_const_round_shift(temp2); |
|
956 |
|
957 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; |
|
958 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; |
|
959 step2[9] = dct_const_round_shift(temp1); |
|
960 step2[14] = dct_const_round_shift(temp2); |
|
961 |
|
962 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; |
|
963 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; |
|
964 step2[10] = dct_const_round_shift(temp1); |
|
965 step2[13] = dct_const_round_shift(temp2); |
|
966 |
|
967 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; |
|
968 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; |
|
969 step2[11] = dct_const_round_shift(temp1); |
|
970 step2[12] = dct_const_round_shift(temp2); |
|
971 |
|
972 step2[16] = step1[16] + step1[17]; |
|
973 step2[17] = step1[16] - step1[17]; |
|
974 step2[18] = -step1[18] + step1[19]; |
|
975 step2[19] = step1[18] + step1[19]; |
|
976 step2[20] = step1[20] + step1[21]; |
|
977 step2[21] = step1[20] - step1[21]; |
|
978 step2[22] = -step1[22] + step1[23]; |
|
979 step2[23] = step1[22] + step1[23]; |
|
980 step2[24] = step1[24] + step1[25]; |
|
981 step2[25] = step1[24] - step1[25]; |
|
982 step2[26] = -step1[26] + step1[27]; |
|
983 step2[27] = step1[26] + step1[27]; |
|
984 step2[28] = step1[28] + step1[29]; |
|
985 step2[29] = step1[28] - step1[29]; |
|
986 step2[30] = -step1[30] + step1[31]; |
|
987 step2[31] = step1[30] + step1[31]; |
|
988 |
|
989 // stage 3 |
|
990 step1[0] = step2[0]; |
|
991 step1[1] = step2[1]; |
|
992 step1[2] = step2[2]; |
|
993 step1[3] = step2[3]; |
|
994 |
|
995 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; |
|
996 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; |
|
997 step1[4] = dct_const_round_shift(temp1); |
|
998 step1[7] = dct_const_round_shift(temp2); |
|
999 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; |
|
1000 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; |
|
1001 step1[5] = dct_const_round_shift(temp1); |
|
1002 step1[6] = dct_const_round_shift(temp2); |
|
1003 |
|
1004 step1[8] = step2[8] + step2[9]; |
|
1005 step1[9] = step2[8] - step2[9]; |
|
1006 step1[10] = -step2[10] + step2[11]; |
|
1007 step1[11] = step2[10] + step2[11]; |
|
1008 step1[12] = step2[12] + step2[13]; |
|
1009 step1[13] = step2[12] - step2[13]; |
|
1010 step1[14] = -step2[14] + step2[15]; |
|
1011 step1[15] = step2[14] + step2[15]; |
|
1012 |
|
1013 step1[16] = step2[16]; |
|
1014 step1[31] = step2[31]; |
|
1015 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; |
|
1016 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; |
|
1017 step1[17] = dct_const_round_shift(temp1); |
|
1018 step1[30] = dct_const_round_shift(temp2); |
|
1019 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; |
|
1020 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; |
|
1021 step1[18] = dct_const_round_shift(temp1); |
|
1022 step1[29] = dct_const_round_shift(temp2); |
|
1023 step1[19] = step2[19]; |
|
1024 step1[20] = step2[20]; |
|
1025 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; |
|
1026 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; |
|
1027 step1[21] = dct_const_round_shift(temp1); |
|
1028 step1[26] = dct_const_round_shift(temp2); |
|
1029 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; |
|
1030 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; |
|
1031 step1[22] = dct_const_round_shift(temp1); |
|
1032 step1[25] = dct_const_round_shift(temp2); |
|
1033 step1[23] = step2[23]; |
|
1034 step1[24] = step2[24]; |
|
1035 step1[27] = step2[27]; |
|
1036 step1[28] = step2[28]; |
|
1037 |
|
1038 // stage 4 |
|
1039 temp1 = (step1[0] + step1[1]) * cospi_16_64; |
|
1040 temp2 = (step1[0] - step1[1]) * cospi_16_64; |
|
1041 step2[0] = dct_const_round_shift(temp1); |
|
1042 step2[1] = dct_const_round_shift(temp2); |
|
1043 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; |
|
1044 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; |
|
1045 step2[2] = dct_const_round_shift(temp1); |
|
1046 step2[3] = dct_const_round_shift(temp2); |
|
1047 step2[4] = step1[4] + step1[5]; |
|
1048 step2[5] = step1[4] - step1[5]; |
|
1049 step2[6] = -step1[6] + step1[7]; |
|
1050 step2[7] = step1[6] + step1[7]; |
|
1051 |
|
1052 step2[8] = step1[8]; |
|
1053 step2[15] = step1[15]; |
|
1054 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; |
|
1055 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; |
|
1056 step2[9] = dct_const_round_shift(temp1); |
|
1057 step2[14] = dct_const_round_shift(temp2); |
|
1058 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; |
|
1059 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; |
|
1060 step2[10] = dct_const_round_shift(temp1); |
|
1061 step2[13] = dct_const_round_shift(temp2); |
|
1062 step2[11] = step1[11]; |
|
1063 step2[12] = step1[12]; |
|
1064 |
|
1065 step2[16] = step1[16] + step1[19]; |
|
1066 step2[17] = step1[17] + step1[18]; |
|
1067 step2[18] = step1[17] - step1[18]; |
|
1068 step2[19] = step1[16] - step1[19]; |
|
1069 step2[20] = -step1[20] + step1[23]; |
|
1070 step2[21] = -step1[21] + step1[22]; |
|
1071 step2[22] = step1[21] + step1[22]; |
|
1072 step2[23] = step1[20] + step1[23]; |
|
1073 |
|
1074 step2[24] = step1[24] + step1[27]; |
|
1075 step2[25] = step1[25] + step1[26]; |
|
1076 step2[26] = step1[25] - step1[26]; |
|
1077 step2[27] = step1[24] - step1[27]; |
|
1078 step2[28] = -step1[28] + step1[31]; |
|
1079 step2[29] = -step1[29] + step1[30]; |
|
1080 step2[30] = step1[29] + step1[30]; |
|
1081 step2[31] = step1[28] + step1[31]; |
|
1082 |
|
1083 // stage 5 |
|
1084 step1[0] = step2[0] + step2[3]; |
|
1085 step1[1] = step2[1] + step2[2]; |
|
1086 step1[2] = step2[1] - step2[2]; |
|
1087 step1[3] = step2[0] - step2[3]; |
|
1088 step1[4] = step2[4]; |
|
1089 temp1 = (step2[6] - step2[5]) * cospi_16_64; |
|
1090 temp2 = (step2[5] + step2[6]) * cospi_16_64; |
|
1091 step1[5] = dct_const_round_shift(temp1); |
|
1092 step1[6] = dct_const_round_shift(temp2); |
|
1093 step1[7] = step2[7]; |
|
1094 |
|
1095 step1[8] = step2[8] + step2[11]; |
|
1096 step1[9] = step2[9] + step2[10]; |
|
1097 step1[10] = step2[9] - step2[10]; |
|
1098 step1[11] = step2[8] - step2[11]; |
|
1099 step1[12] = -step2[12] + step2[15]; |
|
1100 step1[13] = -step2[13] + step2[14]; |
|
1101 step1[14] = step2[13] + step2[14]; |
|
1102 step1[15] = step2[12] + step2[15]; |
|
1103 |
|
1104 step1[16] = step2[16]; |
|
1105 step1[17] = step2[17]; |
|
1106 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; |
|
1107 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; |
|
1108 step1[18] = dct_const_round_shift(temp1); |
|
1109 step1[29] = dct_const_round_shift(temp2); |
|
1110 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; |
|
1111 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; |
|
1112 step1[19] = dct_const_round_shift(temp1); |
|
1113 step1[28] = dct_const_round_shift(temp2); |
|
1114 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; |
|
1115 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; |
|
1116 step1[20] = dct_const_round_shift(temp1); |
|
1117 step1[27] = dct_const_round_shift(temp2); |
|
1118 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; |
|
1119 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; |
|
1120 step1[21] = dct_const_round_shift(temp1); |
|
1121 step1[26] = dct_const_round_shift(temp2); |
|
1122 step1[22] = step2[22]; |
|
1123 step1[23] = step2[23]; |
|
1124 step1[24] = step2[24]; |
|
1125 step1[25] = step2[25]; |
|
1126 step1[30] = step2[30]; |
|
1127 step1[31] = step2[31]; |
|
1128 |
|
1129 // stage 6 |
|
1130 step2[0] = step1[0] + step1[7]; |
|
1131 step2[1] = step1[1] + step1[6]; |
|
1132 step2[2] = step1[2] + step1[5]; |
|
1133 step2[3] = step1[3] + step1[4]; |
|
1134 step2[4] = step1[3] - step1[4]; |
|
1135 step2[5] = step1[2] - step1[5]; |
|
1136 step2[6] = step1[1] - step1[6]; |
|
1137 step2[7] = step1[0] - step1[7]; |
|
1138 step2[8] = step1[8]; |
|
1139 step2[9] = step1[9]; |
|
1140 temp1 = (-step1[10] + step1[13]) * cospi_16_64; |
|
1141 temp2 = (step1[10] + step1[13]) * cospi_16_64; |
|
1142 step2[10] = dct_const_round_shift(temp1); |
|
1143 step2[13] = dct_const_round_shift(temp2); |
|
1144 temp1 = (-step1[11] + step1[12]) * cospi_16_64; |
|
1145 temp2 = (step1[11] + step1[12]) * cospi_16_64; |
|
1146 step2[11] = dct_const_round_shift(temp1); |
|
1147 step2[12] = dct_const_round_shift(temp2); |
|
1148 step2[14] = step1[14]; |
|
1149 step2[15] = step1[15]; |
|
1150 |
|
1151 step2[16] = step1[16] + step1[23]; |
|
1152 step2[17] = step1[17] + step1[22]; |
|
1153 step2[18] = step1[18] + step1[21]; |
|
1154 step2[19] = step1[19] + step1[20]; |
|
1155 step2[20] = step1[19] - step1[20]; |
|
1156 step2[21] = step1[18] - step1[21]; |
|
1157 step2[22] = step1[17] - step1[22]; |
|
1158 step2[23] = step1[16] - step1[23]; |
|
1159 |
|
1160 step2[24] = -step1[24] + step1[31]; |
|
1161 step2[25] = -step1[25] + step1[30]; |
|
1162 step2[26] = -step1[26] + step1[29]; |
|
1163 step2[27] = -step1[27] + step1[28]; |
|
1164 step2[28] = step1[27] + step1[28]; |
|
1165 step2[29] = step1[26] + step1[29]; |
|
1166 step2[30] = step1[25] + step1[30]; |
|
1167 step2[31] = step1[24] + step1[31]; |
|
1168 |
|
1169 // stage 7 |
|
1170 step1[0] = step2[0] + step2[15]; |
|
1171 step1[1] = step2[1] + step2[14]; |
|
1172 step1[2] = step2[2] + step2[13]; |
|
1173 step1[3] = step2[3] + step2[12]; |
|
1174 step1[4] = step2[4] + step2[11]; |
|
1175 step1[5] = step2[5] + step2[10]; |
|
1176 step1[6] = step2[6] + step2[9]; |
|
1177 step1[7] = step2[7] + step2[8]; |
|
1178 step1[8] = step2[7] - step2[8]; |
|
1179 step1[9] = step2[6] - step2[9]; |
|
1180 step1[10] = step2[5] - step2[10]; |
|
1181 step1[11] = step2[4] - step2[11]; |
|
1182 step1[12] = step2[3] - step2[12]; |
|
1183 step1[13] = step2[2] - step2[13]; |
|
1184 step1[14] = step2[1] - step2[14]; |
|
1185 step1[15] = step2[0] - step2[15]; |
|
1186 |
|
1187 step1[16] = step2[16]; |
|
1188 step1[17] = step2[17]; |
|
1189 step1[18] = step2[18]; |
|
1190 step1[19] = step2[19]; |
|
1191 temp1 = (-step2[20] + step2[27]) * cospi_16_64; |
|
1192 temp2 = (step2[20] + step2[27]) * cospi_16_64; |
|
1193 step1[20] = dct_const_round_shift(temp1); |
|
1194 step1[27] = dct_const_round_shift(temp2); |
|
1195 temp1 = (-step2[21] + step2[26]) * cospi_16_64; |
|
1196 temp2 = (step2[21] + step2[26]) * cospi_16_64; |
|
1197 step1[21] = dct_const_round_shift(temp1); |
|
1198 step1[26] = dct_const_round_shift(temp2); |
|
1199 temp1 = (-step2[22] + step2[25]) * cospi_16_64; |
|
1200 temp2 = (step2[22] + step2[25]) * cospi_16_64; |
|
1201 step1[22] = dct_const_round_shift(temp1); |
|
1202 step1[25] = dct_const_round_shift(temp2); |
|
1203 temp1 = (-step2[23] + step2[24]) * cospi_16_64; |
|
1204 temp2 = (step2[23] + step2[24]) * cospi_16_64; |
|
1205 step1[23] = dct_const_round_shift(temp1); |
|
1206 step1[24] = dct_const_round_shift(temp2); |
|
1207 step1[28] = step2[28]; |
|
1208 step1[29] = step2[29]; |
|
1209 step1[30] = step2[30]; |
|
1210 step1[31] = step2[31]; |
|
1211 |
|
1212 // final stage |
|
1213 output[0] = step1[0] + step1[31]; |
|
1214 output[1] = step1[1] + step1[30]; |
|
1215 output[2] = step1[2] + step1[29]; |
|
1216 output[3] = step1[3] + step1[28]; |
|
1217 output[4] = step1[4] + step1[27]; |
|
1218 output[5] = step1[5] + step1[26]; |
|
1219 output[6] = step1[6] + step1[25]; |
|
1220 output[7] = step1[7] + step1[24]; |
|
1221 output[8] = step1[8] + step1[23]; |
|
1222 output[9] = step1[9] + step1[22]; |
|
1223 output[10] = step1[10] + step1[21]; |
|
1224 output[11] = step1[11] + step1[20]; |
|
1225 output[12] = step1[12] + step1[19]; |
|
1226 output[13] = step1[13] + step1[18]; |
|
1227 output[14] = step1[14] + step1[17]; |
|
1228 output[15] = step1[15] + step1[16]; |
|
1229 output[16] = step1[15] - step1[16]; |
|
1230 output[17] = step1[14] - step1[17]; |
|
1231 output[18] = step1[13] - step1[18]; |
|
1232 output[19] = step1[12] - step1[19]; |
|
1233 output[20] = step1[11] - step1[20]; |
|
1234 output[21] = step1[10] - step1[21]; |
|
1235 output[22] = step1[9] - step1[22]; |
|
1236 output[23] = step1[8] - step1[23]; |
|
1237 output[24] = step1[7] - step1[24]; |
|
1238 output[25] = step1[6] - step1[25]; |
|
1239 output[26] = step1[5] - step1[26]; |
|
1240 output[27] = step1[4] - step1[27]; |
|
1241 output[28] = step1[3] - step1[28]; |
|
1242 output[29] = step1[2] - step1[29]; |
|
1243 output[30] = step1[1] - step1[30]; |
|
1244 output[31] = step1[0] - step1[31]; |
|
1245 } |
|
1246 |
|
// Full 32x32 inverse DCT: runs the 1-D transform over all rows, then all
// columns, and accumulates the rounded result into the destination block.
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t intermediate[32 * 32];
  int16_t *row_out = intermediate;
  int r, c;
  int16_t col_in[32], col_out[32];

  // Rows. A row with no non-zero coefficients transforms to all zeros, so
  // detect that case with an OR over the row and skip the transform.
  for (r = 0; r < 32; ++r) {
    int16_t any_nonzero = 0;
    for (c = 0; c < 32; ++c)
      any_nonzero |= input[c];

    if (any_nonzero)
      idct32_1d(input, row_out);
    else
      vpx_memset(row_out, 0, sizeof(int16_t) * 32);
    input += 32;
    row_out += 32;
  }

  // Columns: transform each column of the intermediate, then round by 6
  // bits, add to the prediction and clip to the pixel range.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = intermediate[r * 32 + c];
    idct32_1d(col_in, col_out);
    for (r = 0; r < 32; ++r)
      dest[r * stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + dest[r * stride + c]);
  }
}
|
1283 |
|
// 32x32 inverse DCT for sparse blocks: only the upper-left 8x8 region of
// the input holds non-zero coefficients, so just the first eight rows are
// transformed; the remaining rows stay zero from the array initializer.
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t intermediate[32 * 32] = {0};  // rows 8..31 remain all-zero
  int r, c;
  int16_t col_in[32], col_out[32];

  // Rows (first eight only).
  for (r = 0; r < 8; ++r)
    idct32_1d(input + r * 32, intermediate + r * 32);

  // Columns: transform, round by 6 bits, add to prediction, clip.
  for (c = 0; c < 32; ++c) {
    for (r = 0; r < 32; ++r)
      col_in[r] = intermediate[r * 32 + c];
    idct32_1d(col_in, col_out);
    for (r = 0; r < 32; ++r)
      dest[r * stride + c] =
          clip_pixel(ROUND_POWER_OF_TWO(col_out[r], 6) + dest[r * stride + c]);
  }
}
|
1308 |
|
1309 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { |
|
1310 int i, j; |
|
1311 int a1; |
|
1312 |
|
1313 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); |
|
1314 out = dct_const_round_shift(out * cospi_16_64); |
|
1315 a1 = ROUND_POWER_OF_TWO(out, 6); |
|
1316 |
|
1317 for (j = 0; j < 32; ++j) { |
|
1318 for (i = 0; i < 32; ++i) |
|
1319 dest[i] = clip_pixel(dest[i] + a1); |
|
1320 dest += stride; |
|
1321 } |
|
1322 } |
|
1323 |
|
1324 // idct |
|
// idct
// 4x4 inverse DCT dispatch: a DC-only block (eob <= 1) takes the fast path,
// otherwise the full 16-coefficient transform is used.
void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob <= 1)
    vp9_idct4x4_1_add(input, dest, stride);
  else
    vp9_idct4x4_16_add(input, dest, stride);
}
|
1331 |
|
1332 |
|
// 4x4 inverse Walsh-Hadamard dispatch: DC-only blocks (eob <= 1) use the
// single-coefficient path, everything else the full transform.
void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob <= 1)
    vp9_iwht4x4_1_add(input, dest, stride);
  else
    vp9_iwht4x4_16_add(input, dest, stride);
}
|
1339 |
|
// 8x8 inverse DCT dispatch, selecting a transform variant by eob.
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >= 1.
//
// The calculation can be simplified if there are not many non-zero dct
// coefficients. Use eobs to decide what to do.
// TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
// Combine that with code here.
void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  if (eob == 0)
    return;  // nothing to add

  if (eob == 1) {
    // DC only DCT coefficient
    vp9_idct8x8_1_add(input, dest, stride);
  } else if (eob <= 10) {
    vp9_idct8x8_10_add(input, dest, stride);
  } else {
    vp9_idct8x8_64_add(input, dest, stride);
  }
}
|
1358 |
|
// 16x16 inverse DCT dispatch.
/* The calculation can be simplified if there are not many non-zero dct
 * coefficients. Use eobs to separate different cases. */
void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob == 0)
    return;  // nothing to add

  if (eob == 1) {
    /* DC only DCT coefficient. */
    vp9_idct16x16_1_add(input, dest, stride);
  } else if (eob <= 10) {
    vp9_idct16x16_10_add(input, dest, stride);
  } else {
    vp9_idct16x16_256_add(input, dest, stride);
  }
}
|
1373 |
|
// 32x32 inverse DCT dispatch, selecting a transform variant by eob.
void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
                       int eob) {
  if (eob == 0)
    return;  // nothing to add

  if (eob == 1) {
    vp9_idct32x32_1_add(input, dest, stride);
  } else if (eob <= 34) {
    // non-zero coeff only in upper-left 8x8
    vp9_idct32x32_34_add(input, dest, stride);
  } else {
    vp9_idct32x32_1024_add(input, dest, stride);
  }
}
|
1386 |
|
1387 // iht |
|
1388 void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, |
|
1389 int stride, int eob) { |
|
1390 if (tx_type == DCT_DCT) |
|
1391 vp9_idct4x4_add(input, dest, stride, eob); |
|
1392 else |
|
1393 vp9_iht4x4_16_add(input, dest, stride, tx_type); |
|
1394 } |
|
1395 |
|
1396 void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, |
|
1397 int stride, int eob) { |
|
1398 if (tx_type == DCT_DCT) { |
|
1399 vp9_idct8x8_add(input, dest, stride, eob); |
|
1400 } else { |
|
1401 if (eob > 0) { |
|
1402 vp9_iht8x8_64_add(input, dest, stride, tx_type); |
|
1403 } |
|
1404 } |
|
1405 } |
|
1406 |
|
1407 void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, |
|
1408 int stride, int eob) { |
|
1409 if (tx_type == DCT_DCT) { |
|
1410 vp9_idct16x16_add(input, dest, stride, eob); |
|
1411 } else { |
|
1412 if (eob > 0) { |
|
1413 vp9_iht16x16_256_add(input, dest, stride, tx_type); |
|
1414 } |
|
1415 } |
|
1416 } |