michael@0: /******************************************************************** michael@0: * * michael@0: * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * michael@0: * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * michael@0: * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * michael@0: * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * michael@0: * * michael@0: * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * michael@0: * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * michael@0: * * michael@0: ******************************************************************** michael@0: michael@0: function: michael@0: last mod: $Id: idct.c 17410 2010-09-21 21:53:48Z tterribe $ michael@0: michael@0: ********************************************************************/ michael@0: michael@0: #include michael@0: #include "internal.h" michael@0: #include "dct.h" michael@0: michael@0: /*Performs an inverse 8 point Type-II DCT transform. michael@0: The output is scaled by a factor of 2 relative to the orthonormal version of michael@0: the transform. michael@0: _y: The buffer to store the result in. michael@0: Data will be placed in every 8th entry (e.g., in a column of an 8x8 michael@0: block). michael@0: _x: The input coefficients. michael@0: The first 8 entries are used (e.g., from a row of an 8x8 block).*/ michael@0: static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){ michael@0: ogg_int32_t t[8]; michael@0: ogg_int32_t r; michael@0: /*Stage 1:*/ michael@0: /*0-1 butterfly.*/ michael@0: t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16; michael@0: t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16; michael@0: /*2-3 rotation by 6pi/16.*/ michael@0: t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16); michael@0: t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16); michael@0: /*4-7 rotation by 7pi/16.*/ michael@0: t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16); michael@0: /*5-6 rotation by 3pi/16.*/ michael@0: t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16); michael@0: t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16); michael@0: t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16); michael@0: /*Stage 2:*/ michael@0: /*4-5 butterfly.*/ michael@0: r=t[4]+t[5]; michael@0: t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16; michael@0: t[4]=r; michael@0: /*7-6 butterfly.*/ michael@0: r=t[7]+t[6]; michael@0: t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16; michael@0: t[7]=r; michael@0: /*Stage 3:*/ michael@0: /*0-3 butterfly.*/ michael@0: r=t[0]+t[3]; michael@0: t[3]=t[0]-t[3]; michael@0: t[0]=r; michael@0: /*1-2 butterfly.*/ michael@0: r=t[1]+t[2]; michael@0: t[2]=t[1]-t[2]; michael@0: t[1]=r; michael@0: /*6-5 butterfly.*/ michael@0: r=t[6]+t[5]; michael@0: t[5]=t[6]-t[5]; michael@0: t[6]=r; michael@0: /*Stage 4:*/ michael@0: /*0-7 butterfly.*/ michael@0: _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); michael@0: /*1-6 butterfly.*/ michael@0: _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); michael@0: /*2-5 butterfly.*/ michael@0: _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); michael@0: /*3-4 butterfly.*/ michael@0: _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); michael@0: _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); michael@0: _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); michael@0: _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); michael@0: _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); michael@0: } michael@0: michael@0: /*Performs an inverse 8 point Type-II DCT transform. michael@0: The output is scaled by a factor of 2 relative to the orthonormal version of michael@0: the transform. michael@0: _y: The buffer to store the result in. michael@0: Data will be placed in every 8th entry (e.g., in a column of an 8x8 michael@0: block). michael@0: _x: The input coefficients. michael@0: Only the first 4 entries are used. michael@0: The other 4 are assumed to be 0.*/ michael@0: static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){ michael@0: ogg_int32_t t[8]; michael@0: ogg_int32_t r; michael@0: /*Stage 1:*/ michael@0: t[0]=OC_C4S4*_x[0]>>16; michael@0: t[2]=OC_C6S2*_x[2]>>16; michael@0: t[3]=OC_C2S6*_x[2]>>16; michael@0: t[4]=OC_C7S1*_x[1]>>16; michael@0: t[5]=-(OC_C5S3*_x[3]>>16); michael@0: t[6]=OC_C3S5*_x[3]>>16; michael@0: t[7]=OC_C1S7*_x[1]>>16; michael@0: /*Stage 2:*/ michael@0: r=t[4]+t[5]; michael@0: t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16; michael@0: t[4]=r; michael@0: r=t[7]+t[6]; michael@0: t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16; michael@0: t[7]=r; michael@0: /*Stage 3:*/ michael@0: t[1]=t[0]+t[2]; michael@0: t[2]=t[0]-t[2]; michael@0: r=t[0]+t[3]; michael@0: t[3]=t[0]-t[3]; michael@0: t[0]=r; michael@0: r=t[6]+t[5]; michael@0: t[5]=t[6]-t[5]; michael@0: t[6]=r; michael@0: /*Stage 4:*/ michael@0: _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); michael@0: _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); michael@0: _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); michael@0: _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); michael@0: _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); michael@0: _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); michael@0: _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); michael@0: _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); michael@0: } michael@0: michael@0: /*Performs an inverse 8 point Type-II DCT transform. michael@0: The output is scaled by a factor of 2 relative to the orthonormal version of michael@0: the transform. michael@0: _y: The buffer to store the result in. michael@0: Data will be placed in every 8th entry (e.g., in a column of an 8x8 michael@0: block). michael@0: _x: The input coefficients. michael@0: Only the first 3 entries are used. michael@0: The other 5 are assumed to be 0.*/ michael@0: static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){ michael@0: ogg_int32_t t[8]; michael@0: ogg_int32_t r; michael@0: /*Stage 1:*/ michael@0: t[0]=OC_C4S4*_x[0]>>16; michael@0: t[2]=OC_C6S2*_x[2]>>16; michael@0: t[3]=OC_C2S6*_x[2]>>16; michael@0: t[4]=OC_C7S1*_x[1]>>16; michael@0: t[7]=OC_C1S7*_x[1]>>16; michael@0: /*Stage 2:*/ michael@0: t[5]=OC_C4S4*t[4]>>16; michael@0: t[6]=OC_C4S4*t[7]>>16; michael@0: /*Stage 3:*/ michael@0: t[1]=t[0]+t[2]; michael@0: t[2]=t[0]-t[2]; michael@0: r=t[0]+t[3]; michael@0: t[3]=t[0]-t[3]; michael@0: t[0]=r; michael@0: r=t[6]+t[5]; michael@0: t[5]=t[6]-t[5]; michael@0: t[6]=r; michael@0: /*Stage 4:*/ michael@0: _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); michael@0: _y[1<<3]=(ogg_int16_t)(t[1]+t[6]); michael@0: _y[2<<3]=(ogg_int16_t)(t[2]+t[5]); michael@0: _y[3<<3]=(ogg_int16_t)(t[3]+t[4]); michael@0: _y[4<<3]=(ogg_int16_t)(t[3]-t[4]); michael@0: _y[5<<3]=(ogg_int16_t)(t[2]-t[5]); michael@0: _y[6<<3]=(ogg_int16_t)(t[1]-t[6]); michael@0: _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); michael@0: } michael@0: michael@0: /*Performs an inverse 8 point Type-II DCT transform. michael@0: The output is scaled by a factor of 2 relative to the orthonormal version of michael@0: the transform. michael@0: _y: The buffer to store the result in. michael@0: Data will be placed in every 8th entry (e.g., in a column of an 8x8 michael@0: block). michael@0: _x: The input coefficients. michael@0: Only the first 2 entries are used. michael@0: The other 6 are assumed to be 0.*/ michael@0: static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){ michael@0: ogg_int32_t t[8]; michael@0: ogg_int32_t r; michael@0: /*Stage 1:*/ michael@0: t[0]=OC_C4S4*_x[0]>>16; michael@0: t[4]=OC_C7S1*_x[1]>>16; michael@0: t[7]=OC_C1S7*_x[1]>>16; michael@0: /*Stage 2:*/ michael@0: t[5]=OC_C4S4*t[4]>>16; michael@0: t[6]=OC_C4S4*t[7]>>16; michael@0: /*Stage 3:*/ michael@0: r=t[6]+t[5]; michael@0: t[5]=t[6]-t[5]; michael@0: t[6]=r; michael@0: /*Stage 4:*/ michael@0: _y[0<<3]=(ogg_int16_t)(t[0]+t[7]); michael@0: _y[1<<3]=(ogg_int16_t)(t[0]+t[6]); michael@0: _y[2<<3]=(ogg_int16_t)(t[0]+t[5]); michael@0: _y[3<<3]=(ogg_int16_t)(t[0]+t[4]); michael@0: _y[4<<3]=(ogg_int16_t)(t[0]-t[4]); michael@0: _y[5<<3]=(ogg_int16_t)(t[0]-t[5]); michael@0: _y[6<<3]=(ogg_int16_t)(t[0]-t[6]); michael@0: _y[7<<3]=(ogg_int16_t)(t[0]-t[7]); michael@0: } michael@0: michael@0: /*Performs an inverse 8 point Type-II DCT transform. michael@0: The output is scaled by a factor of 2 relative to the orthonormal version of michael@0: the transform. michael@0: _y: The buffer to store the result in. michael@0: Data will be placed in every 8th entry (e.g., in a column of an 8x8 michael@0: block). michael@0: _x: The input coefficients. michael@0: Only the first entry is used. michael@0: The other 7 are assumed to be 0.*/ michael@0: static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){ michael@0: _y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]= michael@0: _y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16); michael@0: } michael@0: michael@0: /*Performs an inverse 8x8 Type-II DCT transform. michael@0: The input is assumed to be scaled by a factor of 4 relative to orthonormal michael@0: version of the transform. michael@0: All coefficients but the first 3 in zig-zag scan order are assumed to be 0: michael@0: x x 0 0 0 0 0 0 michael@0: x 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: _y: The buffer to store the result in. michael@0: This may be the same as _x. michael@0: _x: The input coefficients.*/ michael@0: static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){ michael@0: ogg_int16_t w[64]; michael@0: int i; michael@0: /*Transform rows of x into columns of w.*/ michael@0: idct8_2(w,_x); michael@0: idct8_1(w+1,_x+8); michael@0: /*Transform rows of w into columns of y.*/ michael@0: for(i=0;i<8;i++)idct8_2(_y+i,w+i*8); michael@0: /*Adjust for the scale factor.*/ michael@0: for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); michael@0: /*Clear input data for next block (decoder only).*/ michael@0: if(_x!=_y)_x[0]=_x[1]=_x[8]=0; michael@0: } michael@0: michael@0: /*Performs an inverse 8x8 Type-II DCT transform. michael@0: The input is assumed to be scaled by a factor of 4 relative to orthonormal michael@0: version of the transform. michael@0: All coefficients but the first 10 in zig-zag scan order are assumed to be 0: michael@0: x x x x 0 0 0 0 michael@0: x x x 0 0 0 0 0 michael@0: x x 0 0 0 0 0 0 michael@0: x 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: 0 0 0 0 0 0 0 0 michael@0: _y: The buffer to store the result in. michael@0: This may be the same as _x. michael@0: _x: The input coefficients.*/ michael@0: static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ michael@0: ogg_int16_t w[64]; michael@0: int i; michael@0: /*Transform rows of x into columns of w.*/ michael@0: idct8_4(w,_x); michael@0: idct8_3(w+1,_x+8); michael@0: idct8_2(w+2,_x+16); michael@0: idct8_1(w+3,_x+24); michael@0: /*Transform rows of w into columns of y.*/ michael@0: for(i=0;i<8;i++)idct8_4(_y+i,w+i*8); michael@0: /*Adjust for the scale factor.*/ michael@0: for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); michael@0: /*Clear input data for next block (decoder only).*/ michael@0: if(_x!=_y)_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0; michael@0: } michael@0: michael@0: /*Performs an inverse 8x8 Type-II DCT transform. michael@0: The input is assumed to be scaled by a factor of 4 relative to orthonormal michael@0: version of the transform. michael@0: _y: The buffer to store the result in. michael@0: This may be the same as _x. michael@0: _x: The input coefficients.*/ michael@0: static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ michael@0: ogg_int16_t w[64]; michael@0: int i; michael@0: /*Transform rows of x into columns of w.*/ michael@0: for(i=0;i<8;i++)idct8(w+i,_x+i*8); michael@0: /*Transform rows of w into columns of y.*/ michael@0: for(i=0;i<8;i++)idct8(_y+i,w+i*8); michael@0: /*Adjust for the scale factor.*/ michael@0: for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4); michael@0: if(_x!=_y)for(i=0;i<64;i++)_x[i]=0; michael@0: } michael@0: michael@0: /*Performs an inverse 8x8 Type-II DCT transform. michael@0: The input is assumed to be scaled by a factor of 4 relative to orthonormal michael@0: version of the transform.*/ michael@0: void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ michael@0: /*_last_zzi is subtly different from an actual count of the number of michael@0: coefficients we decoded for this block. michael@0: It contains the value of zzi BEFORE the final token in the block was michael@0: decoded. michael@0: In most cases this is an EOB token (the continuation of an EOB run from a michael@0: previous block counts), and so this is the same as the coefficient count. michael@0: However, in the case that the last token was NOT an EOB token, but filled michael@0: the block up with exactly 64 coefficients, _last_zzi will be less than 64. michael@0: Provided the last token was not a pure zero run, the minimum value it can michael@0: be is 46, and so that doesn't affect any of the cases in this routine. michael@0: However, if the last token WAS a pure zero run of length 63, then _last_zzi michael@0: will be 1 while the number of coefficients decoded is 64. michael@0: Thus, we will trigger the following special case, where the real michael@0: coefficient count would not. michael@0: Note also that a zero run of length 64 will give _last_zzi a value of 0, michael@0: but we still process the DC coefficient, which might have a non-zero value michael@0: due to DC prediction. michael@0: Although convoluted, this is arguably the correct behavior: it allows us to michael@0: use a smaller transform when the block ends with a long zero run instead michael@0: of a normal EOB token. michael@0: It could be smarter... multiple separate zero runs at the end of a block michael@0: will fool it, but an encoder that generates these really deserves what it michael@0: gets. michael@0: Needless to say we inherited this approach from VP3.*/ michael@0: /*Then perform the iDCT.*/ michael@0: if(_last_zzi<=3)oc_idct8x8_3(_y,_x); michael@0: else if(_last_zzi<=10)oc_idct8x8_10(_y,_x); michael@0: else oc_idct8x8_slow(_y,_x); michael@0: }