media/libtheora/lib/x86/sse2idct.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /********************************************************************
michael@0 2 * *
michael@0 3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
michael@0 4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
michael@0 5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
michael@0 6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
michael@0 7 * *
michael@0 8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
michael@0 9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
michael@0 10 * *
michael@0 11 ********************************************************************
michael@0 12
michael@0 13 function:
michael@0 14 last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
michael@0 15
michael@0 16 ********************************************************************/
michael@0 17
michael@0 18 /*SSE2 acceleration of Theora's iDCT.*/
michael@0 19 #include "x86int.h"
michael@0 20 #include "sse2trans.h"
michael@0 21 #include "../dct.h"
michael@0 22
michael@0 23 #if defined(OC_X86_ASM)
michael@0 24
michael@0 25 /*A table of constants used by the SSE2 and MMX iDCT routines in this file.
  It is laid out as eight groups of eight identical 16-bit words, 16 bytes per
   group, so one movdqa (or one movq, for the MMX code) loads a register in
   which every lane holds the same constant.
  Group 0 (byte offset 0x00) holds 8, the bias OC_IDCT_8x8_D_STORE adds before
   its arithmetic right shift by 4.
  Groups 1 through 7 (byte offsets 0x10 through 0x70) hold OC_C1S7 through
   OC_C7S1, in that order.*/
michael@0 26 const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
michael@0 27 8, 8, 8, 8, 8, 8, 8, 8,
michael@0 28 OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
michael@0 29 OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
michael@0 30 OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
michael@0 31 OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
michael@0 32 OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
michael@0 33 OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
michael@0 34 OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
michael@0 35 };
michael@0 36
michael@0 37
michael@0 38 /*Performs the first three stages of the iDCT.
michael@0 39 xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
michael@0 40 (accessed in that order).
michael@0 41 The remaining rows (0, 1, 4, and 7) must be in _x at their corresponding
   locations (byte offsets 0x00, 0x10, 0x40, and 0x70); _x is the name of the
   asm memory operand they are loaded from.
michael@0 42 On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
michael@0 43 contain rows 4 through 7.
  The expansion also refers to two asm operands by name, so the enclosing asm
   statement must define both: c, from which the constants are loaded, and
   buf, whose first 32 bytes are overwritten as spill space for two
   intermediate rows.*/
michael@0 44 #define OC_IDCT_8x8_ABC(_x) \
michael@0 45 "#OC_IDCT_8x8_ABC\n\t" \
michael@0 46 /*Stage 1:*/ \
michael@0 47 /*2-3 rotation by 6pi/16. \
michael@0 48 xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
michael@0 49 "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
michael@0 50 "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
michael@0 51 "movdqa %%xmm1,%%xmm0\n\t" \
michael@0 52 "pmulhw %%xmm2,%%xmm1\n\t" \
michael@0 53 "movdqa %%xmm4,%%xmm7\n\t" \
michael@0 54 "pmulhw %%xmm6,%%xmm0\n\t" \
michael@0 55 "pmulhw %%xmm2,%%xmm7\n\t" \
michael@0 56 "pmulhw %%xmm6,%%xmm4\n\t" \
michael@0 57 "paddw %%xmm6,%%xmm0\n\t" \
michael@0 58 "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
michael@0 59 "paddw %%xmm1,%%xmm2\n\t" \
michael@0 60 "psubw %%xmm0,%%xmm7\n\t" \
michael@0 61 "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
michael@0 62 "paddw %%xmm4,%%xmm2\n\t" \
michael@0 63 "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
michael@0 64 "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
michael@0 65 /*5-6 rotation by 3pi/16. \
michael@0 66 xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
michael@0 67 "movdqa %%xmm4,%%xmm2\n\t" \
michael@0 68 "movdqa %%xmm6,%%xmm1\n\t" \
michael@0 69 "pmulhw %%xmm3,%%xmm4\n\t" \
michael@0 70 "pmulhw %%xmm5,%%xmm1\n\t" \
michael@0 71 "pmulhw %%xmm3,%%xmm6\n\t" \
michael@0 72 "pmulhw %%xmm5,%%xmm2\n\t" \
michael@0 73 "paddw %%xmm3,%%xmm4\n\t" \
michael@0 74 "paddw %%xmm5,%%xmm3\n\t" \
michael@0 75 "paddw %%xmm6,%%xmm3\n\t" \
michael@0 76 "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
michael@0 77 "paddw %%xmm5,%%xmm1\n\t" \
michael@0 78 "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
michael@0 79 "paddw %%xmm3,%%xmm2\n\t" \
michael@0 80 "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
michael@0 81 "psubw %%xmm4,%%xmm1\n\t" \
michael@0 82 "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
michael@0 83 /*4-7 rotation by 7pi/16. \
michael@0 84 xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
michael@0 85 "movdqa %%xmm3,%%xmm0\n\t" \
michael@0 86 "movdqa %%xmm4,%%xmm7\n\t" \
michael@0 87 "pmulhw %%xmm5,%%xmm3\n\t" \
michael@0 88 "pmulhw %%xmm5,%%xmm7\n\t" \
michael@0 89 "pmulhw %%xmm6,%%xmm4\n\t" \
michael@0 90 "pmulhw %%xmm6,%%xmm0\n\t" \
michael@0 91 "paddw %%xmm6,%%xmm4\n\t" \
michael@0 92 "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
michael@0 93 "paddw %%xmm5,%%xmm7\n\t" \
michael@0 94 "psubw %%xmm4,%%xmm3\n\t" \
michael@0 95 "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
michael@0 96 "paddw %%xmm7,%%xmm0\n\t" \
michael@0 97 "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
michael@0 98 /*0-1 butterfly. \
michael@0 99 xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
michael@0 100 "paddw %%xmm7,%%xmm6\n\t" \
michael@0 101 "movdqa %%xmm4,%%xmm5\n\t" \
michael@0 102 "pmulhw %%xmm6,%%xmm4\n\t" \
michael@0 103 "paddw %%xmm7,%%xmm7\n\t" \
michael@0 104 "psubw %%xmm6,%%xmm7\n\t" \
michael@0 105 "paddw %%xmm6,%%xmm4\n\t" \
michael@0 106 /*Stage 2:*/ \
michael@0 107 /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
michael@0 108 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
michael@0 109 "movdqa %%xmm3,%%xmm6\n\t" \
michael@0 110 "paddw %%xmm1,%%xmm3\n\t" \
michael@0 111 "psubw %%xmm1,%%xmm6\n\t" \
michael@0 112 "movdqa %%xmm5,%%xmm1\n\t" \
michael@0 113 "pmulhw %%xmm7,%%xmm5\n\t" \
michael@0 114 "paddw %%xmm7,%%xmm5\n\t" \
michael@0 115 "movdqa %%xmm0,%%xmm7\n\t" \
michael@0 116 "paddw %%xmm2,%%xmm0\n\t" \
michael@0 117 "psubw %%xmm2,%%xmm7\n\t" \
michael@0 118 "movdqa %%xmm1,%%xmm2\n\t" \
michael@0 119 "pmulhw %%xmm6,%%xmm1\n\t" \
michael@0 120 "pmulhw %%xmm7,%%xmm2\n\t" \
michael@0 121 "paddw %%xmm6,%%xmm1\n\t" \
michael@0 122 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
michael@0 123 "paddw %%xmm7,%%xmm2\n\t" \
michael@0 124 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
michael@0 125 /*Stage 3: \
michael@0 126 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
michael@0 127 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
michael@0 128 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
michael@0 129 "paddw %%xmm2,%%xmm1\n\t" \
michael@0 130 "paddw %%xmm5,%%xmm6\n\t" \
michael@0 131 "paddw %%xmm4,%%xmm7\n\t" \
michael@0 132 "paddw %%xmm2,%%xmm2\n\t" \
michael@0 133 "paddw %%xmm4,%%xmm4\n\t" \
michael@0 134 "paddw %%xmm5,%%xmm5\n\t" \
michael@0 135 "psubw %%xmm1,%%xmm2\n\t" \
michael@0 136 "psubw %%xmm7,%%xmm4\n\t" \
michael@0 137 "psubw %%xmm6,%%xmm5\n\t" \
michael@0 138
michael@0 139 /*Performs the last stage of the iDCT.
michael@0 140 On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
michael@0 141 contain rows 4 through 7.
michael@0 142 On output, xmm0 through xmm7 contain the corresponding rows.
  This variant works purely in registers: it reads and writes no memory and
   applies no rounding bias or shift (compare OC_IDCT_8x8_D_STORE).*/
michael@0 143 #define OC_IDCT_8x8_D \
michael@0 144 "#OC_IDCT_8x8_D\n\t" \
michael@0 145 /*Stage 4: \
michael@0 146 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
michael@0 147 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
michael@0 148 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
michael@0 149 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
michael@0 150 "psubw %%xmm0,%%xmm7\n\t" \
michael@0 151 "psubw %%xmm1,%%xmm6\n\t" \
michael@0 152 "psubw %%xmm2,%%xmm5\n\t" \
michael@0 153 "psubw %%xmm3,%%xmm4\n\t" \
michael@0 154 "paddw %%xmm0,%%xmm0\n\t" \
michael@0 155 "paddw %%xmm1,%%xmm1\n\t" \
michael@0 156 "paddw %%xmm2,%%xmm2\n\t" \
michael@0 157 "paddw %%xmm3,%%xmm3\n\t" \
michael@0 158 "paddw %%xmm7,%%xmm0\n\t" \
michael@0 159 "paddw %%xmm6,%%xmm1\n\t" \
michael@0 160 "paddw %%xmm5,%%xmm2\n\t" \
michael@0 161 "paddw %%xmm4,%%xmm3\n\t" \
michael@0 162
michael@0 163 /*Performs the last stage of the iDCT, then rounds, scales, and stores the
   result.
michael@0 164 On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3
michael@0 165 contain rows 4 through 7.
  Every output row has the constant loaded from offset 0x00 of the asm operand
   c added to it, is shifted right arithmetically by 4 bits, and is stored to
   the asm operand y at byte offset 0x10*row.
  Because all eight registers are live, xmm4 is first spilled to offset 0x40
   of y so that xmm4 can hold the bias; the spilled value is added back from
   memory, and that slot is later overwritten with the final row 4.
michael@0 166 On output, xmm0 through xmm7 contain the corresponding rows (after the bias
   and shift).*/
michael@0 167 #define OC_IDCT_8x8_D_STORE \
michael@0 168 "#OC_IDCT_8x8_D_STORE\n\t" \
michael@0 169 /*Stage 4: \
michael@0 170 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
michael@0 171 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
michael@0 172 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
michael@0 173 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
michael@0 174 "psubw %%xmm3,%%xmm4\n\t" \
michael@0 175 "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
michael@0 176 "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
michael@0 177 "psubw %%xmm0,%%xmm7\n\t" \
michael@0 178 "psubw %%xmm1,%%xmm6\n\t" \
michael@0 179 "psubw %%xmm2,%%xmm5\n\t" \
michael@0 180 "paddw %%xmm4,%%xmm7\n\t" \
michael@0 181 "paddw %%xmm4,%%xmm6\n\t" \
michael@0 182 "paddw %%xmm4,%%xmm5\n\t" \
michael@0 183 "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
michael@0 184 "paddw %%xmm0,%%xmm0\n\t" \
michael@0 185 "paddw %%xmm1,%%xmm1\n\t" \
michael@0 186 "paddw %%xmm2,%%xmm2\n\t" \
michael@0 187 "paddw %%xmm3,%%xmm3\n\t" \
michael@0 188 "paddw %%xmm7,%%xmm0\n\t" \
michael@0 189 "paddw %%xmm6,%%xmm1\n\t" \
michael@0 190 "psraw $4,%%xmm0\n\t" \
michael@0 191 "paddw %%xmm5,%%xmm2\n\t" \
michael@0 192 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
michael@0 193 "psraw $4,%%xmm1\n\t" \
michael@0 194 "paddw %%xmm4,%%xmm3\n\t" \
michael@0 195 "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
michael@0 196 "psraw $4,%%xmm2\n\t" \
michael@0 197 "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
michael@0 198 "psraw $4,%%xmm3\n\t" \
michael@0 199 "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
michael@0 200 "psraw $4,%%xmm4\n\t" \
michael@0 201 "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
michael@0 202 "psraw $4,%%xmm5\n\t" \
michael@0 203 "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
michael@0 204 "psraw $4,%%xmm6\n\t" \
michael@0 205 "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
michael@0 206 "psraw $4,%%xmm7\n\t" \
michael@0 207 "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
michael@0 208
/*Full 8x8 iDCT: reads all 64 coefficients from _x and writes 64 results to _y.
  The first pass (OC_IDCT_8x8_ABC + OC_IDCT_8x8_D) reads _x, the result is
   transposed in registers by OC_TRANSPOSE_8x8, and the second pass
   (OC_IDCT_8x8_ABC + OC_IDCT_8x8_D_STORE) biases, shifts, and stores to _y.
  _y: Output block; accessed with movdqa, which requires 16-byte alignment.
  _x: Input block; accessed with movdqa, which requires 16-byte alignment.
      If _x!=_y, all 64 entries are set to zero before returning.
  NOTE(review): the asm statements modify xmm0 through xmm7 but declare no
   clobber list; confirm that this is intentional.
  NOTE(review): the c operand is declared as 128 ogg_int16_t elements, while
   OC_IDCT_CONSTS is declared with 64 elements; only byte offsets 0x00 through
   0x7F are actually read.
   Confirm the intended operand size.*/
michael@0 209 static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
michael@0 210 OC_ALIGN16(ogg_int16_t buf[16]);
michael@0 211 /*This routine accepts an 8x8 matrix pre-transposed.*/
michael@0 212 __asm__ __volatile__(
michael@0 213 /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
michael@0 214 "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
michael@0 215 "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
michael@0 216 "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
michael@0 217 "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
michael@0 218 OC_IDCT_8x8_ABC(x)
michael@0 219 OC_IDCT_8x8_D
michael@0 220 OC_TRANSPOSE_8x8
michael@0 221 /*Store rows 0, 1, 4, and 7 to y, because the second OC_IDCT_8x8_ABC pass
   loads exactly those rows from memory.
  Rows 2, 3, 5, and 6 are presumably left in xmm2, xmm3, xmm5, and xmm6 by
   OC_TRANSPOSE_8x8 (defined outside this file; confirm there).*/
michael@0 222 "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
michael@0 223 "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
michael@0 224 "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
michael@0 225 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
michael@0 226 OC_IDCT_8x8_ABC(y)
michael@0 227 OC_IDCT_8x8_D_STORE
michael@0 228 :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
michael@0 229 [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
michael@0 230 :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
michael@0 231 [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
michael@0 232 );
michael@0 233 if(_x!=_y){
michael@0 234 int i;
michael@0 235 __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::);
michael@0 236 /*Clear input data for next block (decoder only).
  Each iteration zeroes 32 coefficients (four 16-byte stores), so the two
   iterations together clear all 64.
  NOTE(review): xmm0 is zeroed in a separate asm statement above, and the
   stores below assume that it is still zero; nothing tells the compiler that
   xmm0 must be preserved between the statements.
   Confirm that this is safe with the compilers in use.*/
michael@0 237 for(i=0;i<2;i++){
michael@0 238 __asm__ __volatile__(
michael@0 239 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
michael@0 240 "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
michael@0 241 "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
michael@0 242 "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
michael@0 243 :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
michael@0 244 );
michael@0 245 }
michael@0 246 }
michael@0 247 }
michael@0 248
michael@0 249 /*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
michael@0 250 need to work with four columns at a time.
michael@0 251 Doing this in MMX is faster on processors with a 64-bit data path.
  On input, mm0 through mm3 must contain X0 through X3 (four 16-bit values
   each); X4 through X7 are treated as zero and are never loaded.
  All four stages are performed, so on output mm0 through mm7 contain the
   corresponding rows, four values per register.
  The expansion refers to the asm operands c (constants, read with movq, i.e.,
   the first 8 bytes of each 16-byte group) and buf (scratch; 8 bytes are
   written at each of the offsets 0x00 and 0x10) by name, so the enclosing asm
   statement must define both.*/
michael@0 252 #define OC_IDCT_8x8_10_MMX \
michael@0 253 "#OC_IDCT_8x8_10_MMX\n\t" \
michael@0 254 /*Stage 1:*/ \
michael@0 255 /*2-3 rotation by 6pi/16. \
michael@0 256 mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
michael@0 257 "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
michael@0 258 "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
michael@0 259 "pmulhw %%mm2,%%mm6\n\t" \
michael@0 260 "pmulhw %%mm2,%%mm7\n\t" \
michael@0 261 "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
michael@0 262 "paddw %%mm6,%%mm2\n\t" \
michael@0 263 "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
michael@0 264 "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
michael@0 265 "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
michael@0 266 /*5-6 rotation by 3pi/16. \
michael@0 267 mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
michael@0 268 "pmulhw %%mm3,%%mm5\n\t" \
michael@0 269 "pmulhw %%mm3,%%mm2\n\t" \
michael@0 270 "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
michael@0 271 "paddw %%mm3,%%mm5\n\t" \
michael@0 272 "paddw %%mm3,%%mm2\n\t" \
michael@0 273 "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
michael@0 274 /*4-7 rotation by 7pi/16. \
michael@0 275 mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
michael@0 276 "pmulhw %%mm1,%%mm3\n\t" \
michael@0 277 "pmulhw %%mm1,%%mm7\n\t" \
michael@0 278 "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
michael@0 279 "movq %%mm3,%%mm6\n\t" \
michael@0 280 "paddw %%mm1,%%mm7\n\t" \
michael@0 281 /*0-1 butterfly. \
michael@0 282 mm4=C4, mm0=X0, X4=0.*/ \
michael@0 283 /*Stage 2:*/ \
michael@0 284 /*4-5 butterfly: mm3=t[4], mm5=t[5] \
michael@0 285 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
michael@0 286 "psubw %%mm5,%%mm3\n\t" \
michael@0 287 "paddw %%mm5,%%mm6\n\t" \
michael@0 288 "movq %%mm4,%%mm1\n\t" \
michael@0 289 "pmulhw %%mm0,%%mm4\n\t" \
michael@0 290 "paddw %%mm0,%%mm4\n\t" \
michael@0 291 "movq %%mm7,%%mm0\n\t" \
michael@0 292 "movq %%mm4,%%mm5\n\t" \
michael@0 293 "paddw %%mm2,%%mm0\n\t" \
michael@0 294 "psubw %%mm2,%%mm7\n\t" \
michael@0 295 "movq %%mm1,%%mm2\n\t" \
michael@0 296 "pmulhw %%mm6,%%mm1\n\t" \
michael@0 297 "pmulhw %%mm7,%%mm2\n\t" \
michael@0 298 "paddw %%mm6,%%mm1\n\t" \
michael@0 299 "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
michael@0 300 "paddw %%mm7,%%mm2\n\t" \
michael@0 301 "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
michael@0 302 /*Stage 3: \
michael@0 303 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
michael@0 304 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
michael@0 305 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
michael@0 306 "paddw %%mm2,%%mm1\n\t" \
michael@0 307 "paddw %%mm5,%%mm6\n\t" \
michael@0 308 "paddw %%mm4,%%mm7\n\t" \
michael@0 309 "paddw %%mm2,%%mm2\n\t" \
michael@0 310 "paddw %%mm4,%%mm4\n\t" \
michael@0 311 "paddw %%mm5,%%mm5\n\t" \
michael@0 312 "psubw %%mm1,%%mm2\n\t" \
michael@0 313 "psubw %%mm7,%%mm4\n\t" \
michael@0 314 "psubw %%mm6,%%mm5\n\t" \
michael@0 315 /*Stage 4: \
michael@0 316 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
michael@0 317 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
michael@0 318 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
michael@0 319 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
michael@0 320 "psubw %%mm0,%%mm7\n\t" \
michael@0 321 "psubw %%mm1,%%mm6\n\t" \
michael@0 322 "psubw %%mm2,%%mm5\n\t" \
michael@0 323 "psubw %%mm3,%%mm4\n\t" \
michael@0 324 "paddw %%mm0,%%mm0\n\t" \
michael@0 325 "paddw %%mm1,%%mm1\n\t" \
michael@0 326 "paddw %%mm2,%%mm2\n\t" \
michael@0 327 "paddw %%mm3,%%mm3\n\t" \
michael@0 328 "paddw %%mm7,%%mm0\n\t" \
michael@0 329 "paddw %%mm6,%%mm1\n\t" \
michael@0 330 "paddw %%mm5,%%mm2\n\t" \
michael@0 331 "paddw %%mm4,%%mm3\n\t" \
michael@0 332
/*Performs the first three stages of the iDCT for the 10-coefficient case, in
   which X4 through X7 are treated as zero and never loaded.
  On input, xmm0 through xmm3 must contain X0 through X3.
  On output, the registers are arranged as OC_IDCT_8x8_D and
   OC_IDCT_8x8_D_STORE expect on input.
  The expansion refers to the asm operands c (constants) and buf (scratch; its
   first 32 bytes are overwritten) by name, so the enclosing asm statement
   must define both.*/
michael@0 333 #define OC_IDCT_8x8_10_ABC \
michael@0 334 "#OC_IDCT_8x8_10_ABC\n\t" \
michael@0 335 /*Stage 1:*/ \
michael@0 336 /*2-3 rotation by 6pi/16. \
michael@0 337 xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
michael@0 338 "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
michael@0 339 "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
michael@0 340 "pmulhw %%xmm2,%%xmm6\n\t" \
michael@0 341 "pmulhw %%xmm2,%%xmm7\n\t" \
michael@0 342 "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
michael@0 343 "paddw %%xmm6,%%xmm2\n\t" \
michael@0 344 "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
michael@0 345 "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
michael@0 346 "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
michael@0 347 /*5-6 rotation by 3pi/16. \
michael@0 348 xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
michael@0 349 "pmulhw %%xmm3,%%xmm5\n\t" \
michael@0 350 "pmulhw %%xmm3,%%xmm2\n\t" \
michael@0 351 "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
michael@0 352 "paddw %%xmm3,%%xmm5\n\t" \
michael@0 353 "paddw %%xmm3,%%xmm2\n\t" \
michael@0 354 "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
michael@0 355 /*4-7 rotation by 7pi/16. \
michael@0 356 xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
michael@0 357 "pmulhw %%xmm1,%%xmm3\n\t" \
michael@0 358 "pmulhw %%xmm1,%%xmm7\n\t" \
michael@0 359 "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
michael@0 360 "movdqa %%xmm3,%%xmm6\n\t" \
michael@0 361 "paddw %%xmm1,%%xmm7\n\t" \
michael@0 362 /*0-1 butterfly. \
michael@0 363 xmm4=C4, xmm0=X0, X4=0.*/ \
michael@0 364 /*Stage 2:*/ \
michael@0 365 /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
michael@0 366 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
michael@0 367 "psubw %%xmm5,%%xmm3\n\t" \
michael@0 368 "paddw %%xmm5,%%xmm6\n\t" \
michael@0 369 "movdqa %%xmm4,%%xmm1\n\t" \
michael@0 370 "pmulhw %%xmm0,%%xmm4\n\t" \
michael@0 371 "paddw %%xmm0,%%xmm4\n\t" \
michael@0 372 "movdqa %%xmm7,%%xmm0\n\t" \
michael@0 373 "movdqa %%xmm4,%%xmm5\n\t" \
michael@0 374 "paddw %%xmm2,%%xmm0\n\t" \
michael@0 375 "psubw %%xmm2,%%xmm7\n\t" \
michael@0 376 "movdqa %%xmm1,%%xmm2\n\t" \
michael@0 377 "pmulhw %%xmm6,%%xmm1\n\t" \
michael@0 378 "pmulhw %%xmm7,%%xmm2\n\t" \
michael@0 379 "paddw %%xmm6,%%xmm1\n\t" \
michael@0 380 "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
michael@0 381 "paddw %%xmm7,%%xmm2\n\t" \
michael@0 382 "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
michael@0 383 /*Stage 3: \
michael@0 384 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
michael@0 385 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
michael@0 386 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
michael@0 387 "paddw %%xmm2,%%xmm1\n\t" \
michael@0 388 "paddw %%xmm5,%%xmm6\n\t" \
michael@0 389 "paddw %%xmm4,%%xmm7\n\t" \
michael@0 390 "paddw %%xmm2,%%xmm2\n\t" \
michael@0 391 "paddw %%xmm4,%%xmm4\n\t" \
michael@0 392 "paddw %%xmm5,%%xmm5\n\t" \
michael@0 393 "psubw %%xmm1,%%xmm2\n\t" \
michael@0 394 "psubw %%xmm7,%%xmm4\n\t" \
michael@0 395 "psubw %%xmm6,%%xmm5\n\t" \
michael@0 396
/*Reduced 8x8 iDCT that reads only the first four 16-bit values of rows 0
   through 3 of _x (a 4x4 corner) and treats everything else as zero.
  The first pass runs in MMX registers (OC_IDCT_8x8_10_MMX),
   OC_TRANSPOSE_8x4_MMX2SSE hands the result to the SSE2 second pass
   (OC_IDCT_8x8_10_ABC + OC_IDCT_8x8_D_STORE), and all 64 results are stored
   to _y.
  _y: Output block; stored with movdqa, which requires 16-byte alignment.
  _x: Input block.
      If _x!=_y, the 4x4 corner that was read is set to zero before returning.
  NOTE(review): the mm registers are used and no emms is issued in this
   function; presumably the caller is responsible for that (confirm).
  NOTE(review): the asm statements declare no clobber list although they
   modify mm and xmm registers; confirm that this is intentional.
  NOTE(review): the x input operand is written without the enclosing
   parentheses used by every other operand, so it relies on
   OC_CONST_ARRAY_OPERAND expanding to a parenthesized expression.
  NOTE(review): the buf operand is typed as short while buf is declared as
   ogg_int16_t, and the c operand is declared as 128 elements while
   OC_IDCT_CONSTS is declared with 64; confirm both.*/
michael@0 397 static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
michael@0 398 OC_ALIGN16(ogg_int16_t buf[16]);
michael@0 399 /*This routine accepts an 8x8 matrix pre-transposed.*/
michael@0 400 __asm__ __volatile__(
michael@0 401 "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
michael@0 402 "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
michael@0 403 "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
michael@0 404 "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
michael@0 405 OC_IDCT_8x8_10_MMX
michael@0 406 OC_TRANSPOSE_8x4_MMX2SSE
michael@0 407 OC_IDCT_8x8_10_ABC
michael@0 408 OC_IDCT_8x8_D_STORE
michael@0 409 :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)),
michael@0 410 [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
michael@0 411 :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
michael@0 412 [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128))
michael@0 413 );
michael@0 414 if(_x!=_y){
michael@0 415 /*Clear input data for next block (decoder only).
  Only the 8 bytes at the start of each of rows 0 through 3 are zeroed, i.e.,
   the same locations that were loaded above.
  The operand is declared as 28 elements (56 bytes), which spans from the
   first store through the end of the last 8-byte store at offset 0x30.*/
michael@0 416 __asm__ __volatile__(
michael@0 417 "pxor %%mm0,%%mm0\n\t"
michael@0 418 "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
michael@0 419 "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
michael@0 420 "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
michael@0 421 "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
michael@0 422 :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
michael@0 423 );
michael@0 424 }
michael@0 425 }
michael@0 426
michael@0 427 /*Performs an inverse 8x8 Type-II DCT transform.
michael@0 428 The input is assumed to be scaled by a factor of 4 relative to orthonormal
michael@0 429 version of the transform.
  _y:        The buffer to store the result in.
  _x:        The input coefficients.
             If _x!=_y, the helper that is called also zeroes input
              coefficients: all 64 in the full version, and only the 4x4
              corner it read in the 10-coefficient version.
  _last_zzi: Selects the implementation: values of 10 or less use
              oc_idct8x8_10_sse2(), anything larger uses
              oc_idct8x8_slow_sse2().
             See the comment in the body for what this value means.*/
michael@0 430 void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
michael@0 431 /*_last_zzi is subtly different from an actual count of the number of
michael@0 432 coefficients we decoded for this block.
michael@0 433 It contains the value of zzi BEFORE the final token in the block was
michael@0 434 decoded.
michael@0 435 In most cases this is an EOB token (the continuation of an EOB run from a
michael@0 436 previous block counts), and so this is the same as the coefficient count.
michael@0 437 However, in the case that the last token was NOT an EOB token, but filled
michael@0 438 the block up with exactly 64 coefficients, _last_zzi will be less than 64.
michael@0 439 Provided the last token was not a pure zero run, the minimum value it can
michael@0 440 be is 46, and so that doesn't affect any of the cases in this routine.
michael@0 441 However, if the last token WAS a pure zero run of length 63, then _last_zzi
michael@0 442 will be 1 while the number of coefficients decoded is 64.
michael@0 443 Thus, we will trigger the following special case, where the real
michael@0 444 coefficient count would not.
michael@0 445 Note also that a zero run of length 64 will give _last_zzi a value of 0,
michael@0 446 but we still process the DC coefficient, which might have a non-zero value
michael@0 447 due to DC prediction.
michael@0 448 Although convoluted, this is arguably the correct behavior: it allows us to
michael@0 449 use a smaller transform when the block ends with a long zero run instead
michael@0 450 of a normal EOB token.
michael@0 451 It could be smarter... multiple separate zero runs at the end of a block
michael@0 452 will fool it, but an encoder that generates these really deserves what it
michael@0 453 gets.
michael@0 454 Needless to say we inherited this approach from VP3.*/
michael@0 455 /*Then perform the iDCT, using the reduced transform when _last_zzi is at
   most 10.*/
michael@0 456 if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
michael@0 457 else oc_idct8x8_slow_sse2(_y,_x);
michael@0 458 }
michael@0 459
michael@0 460 #endif

mercurial