media/libtheora/lib/x86/sse2idct.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libtheora/lib/x86/sse2idct.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,460 @@
     1.4 +/********************************************************************
     1.5 + *                                                                  *
     1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
    1.10 + *                                                                  *
    1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
    1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    1.13 + *                                                                  *
    1.14 + ********************************************************************
    1.15 +
    1.16 +  function:
    1.17 +    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
    1.18 +
    1.19 + ********************************************************************/
    1.20 +
    1.21 +/*SSE2 acceleration of Theora's iDCT.*/
    1.22 +#include "x86int.h"
    1.23 +#include "sse2trans.h"
    1.24 +#include "../dct.h"
    1.25 +
    1.26 +#if defined(OC_X86_ASM)
    1.27 +
    1.28 +/*iDCT constants for the SSE2 and MMX code below, one 16-byte row per value: the rounding bias 8, then C1S7..C7S1.*/
    1.29 +const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
    1.30 +        8,      8,      8,      8,      8,      8,      8,      8,
    1.31 +  OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
    1.32 +  OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
    1.33 +  OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
    1.34 +  OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
    1.35 +  OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
    1.36 +  OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
    1.37 +  OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
    1.38 +};
    1.39 +
    1.40 +
    1.41 +/*Performs the first three stages of the iDCT.
    1.42 +  xmm2, xmm6, xmm3, and xmm5 must contain rows 2, 6, 3, and 5 of the input;
    1.43 +   rows 7, 1, 4, and 0 are loaded from _x at their corresponding offsets.
    1.44 +  Constants are read from c, and the first 32 bytes of buf are used as scratch.
    1.45 +  On output, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
    1.46 +   contain t[7] down to t[4], as the stage 4 butterflies expect.*/
    1.47 +#define OC_IDCT_8x8_ABC(_x) \
    1.48 +  "#OC_IDCT_8x8_ABC\n\t" \
    1.49 +  /*Stage 1:*/ \
    1.50 +  /*2-3 rotation by 6pi/16. \
    1.51 +    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
    1.52 +  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
    1.53 +  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
    1.54 +  "movdqa %%xmm1,%%xmm0\n\t" \
    1.55 +  "pmulhw %%xmm2,%%xmm1\n\t" \
    1.56 +  "movdqa %%xmm4,%%xmm7\n\t" \
    1.57 +  "pmulhw %%xmm6,%%xmm0\n\t" \
    1.58 +  "pmulhw %%xmm2,%%xmm7\n\t" \
    1.59 +  "pmulhw %%xmm6,%%xmm4\n\t" \
    1.60 +  "paddw %%xmm6,%%xmm0\n\t" \
    1.61 +  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
    1.62 +  "paddw %%xmm1,%%xmm2\n\t" \
    1.63 +  "psubw %%xmm0,%%xmm7\n\t" \
    1.64 +  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
    1.65 +  "paddw %%xmm4,%%xmm2\n\t" \
    1.66 +  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
    1.67 +  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
    1.68 +  /*5-6 rotation by 3pi/16. \
    1.69 +    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
    1.70 +  "movdqa %%xmm4,%%xmm2\n\t" \
    1.71 +  "movdqa %%xmm6,%%xmm1\n\t" \
    1.72 +  "pmulhw %%xmm3,%%xmm4\n\t" \
    1.73 +  "pmulhw %%xmm5,%%xmm1\n\t" \
    1.74 +  "pmulhw %%xmm3,%%xmm6\n\t" \
    1.75 +  "pmulhw %%xmm5,%%xmm2\n\t" \
    1.76 +  "paddw %%xmm3,%%xmm4\n\t" \
    1.77 +  "paddw %%xmm5,%%xmm3\n\t" \
    1.78 +  "paddw %%xmm6,%%xmm3\n\t" \
    1.79 +  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
    1.80 +  "paddw %%xmm5,%%xmm1\n\t" \
    1.81 +  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
    1.82 +  "paddw %%xmm3,%%xmm2\n\t" \
    1.83 +  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
    1.84 +  "psubw %%xmm4,%%xmm1\n\t" \
    1.85 +  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
    1.86 +  /*4-7 rotation by 7pi/16. \
    1.87 +    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
    1.88 +  "movdqa %%xmm3,%%xmm0\n\t" \
    1.89 +  "movdqa %%xmm4,%%xmm7\n\t" \
    1.90 +  "pmulhw %%xmm5,%%xmm3\n\t" \
    1.91 +  "pmulhw %%xmm5,%%xmm7\n\t" \
    1.92 +  "pmulhw %%xmm6,%%xmm4\n\t" \
    1.93 +  "pmulhw %%xmm6,%%xmm0\n\t" \
    1.94 +  "paddw %%xmm6,%%xmm4\n\t" \
    1.95 +  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
    1.96 +  "paddw %%xmm5,%%xmm7\n\t" \
    1.97 +  "psubw %%xmm4,%%xmm3\n\t" \
    1.98 +  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
    1.99 +  "paddw %%xmm7,%%xmm0\n\t" \
   1.100 +  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
   1.101 +  /*0-1 butterfly. \
   1.102 +    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
   1.103 +  "paddw %%xmm7,%%xmm6\n\t" \
   1.104 +  "movdqa %%xmm4,%%xmm5\n\t" \
   1.105 +  "pmulhw %%xmm6,%%xmm4\n\t" \
   1.106 +  "paddw %%xmm7,%%xmm7\n\t" \
   1.107 +  "psubw %%xmm6,%%xmm7\n\t" \
   1.108 +  "paddw %%xmm6,%%xmm4\n\t" \
   1.109 +  /*Stage 2:*/ \
   1.110 +  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
   1.111 +    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
   1.112 +  "movdqa %%xmm3,%%xmm6\n\t" \
   1.113 +  "paddw %%xmm1,%%xmm3\n\t" \
   1.114 +  "psubw %%xmm1,%%xmm6\n\t" \
   1.115 +  "movdqa %%xmm5,%%xmm1\n\t" \
   1.116 +  "pmulhw %%xmm7,%%xmm5\n\t" \
   1.117 +  "paddw %%xmm7,%%xmm5\n\t" \
   1.118 +  "movdqa %%xmm0,%%xmm7\n\t" \
   1.119 +  "paddw %%xmm2,%%xmm0\n\t" \
   1.120 +  "psubw %%xmm2,%%xmm7\n\t" \
   1.121 +  "movdqa %%xmm1,%%xmm2\n\t" \
   1.122 +  "pmulhw %%xmm6,%%xmm1\n\t" \
   1.123 +  "pmulhw %%xmm7,%%xmm2\n\t" \
   1.124 +  "paddw %%xmm6,%%xmm1\n\t" \
   1.125 +  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
   1.126 +  "paddw %%xmm7,%%xmm2\n\t" \
   1.127 +  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
   1.128 +  /*Stage 3: \
   1.129 +    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
   1.130 +    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
   1.131 +    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
   1.132 +  "paddw %%xmm2,%%xmm1\n\t" \
   1.133 +  "paddw %%xmm5,%%xmm6\n\t" \
   1.134 +  "paddw %%xmm4,%%xmm7\n\t" \
   1.135 +  "paddw %%xmm2,%%xmm2\n\t" \
   1.136 +  "paddw %%xmm4,%%xmm4\n\t" \
   1.137 +  "paddw %%xmm5,%%xmm5\n\t" \
   1.138 +  "psubw %%xmm1,%%xmm2\n\t" \
   1.139 +  "psubw %%xmm7,%%xmm4\n\t" \
   1.140 +  "psubw %%xmm6,%%xmm5\n\t" \
   1.141 +
   1.142 +/*Performs the last stage of the iDCT.
   1.143 +  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
   1.144 +   contain t[7] down to t[4].
   1.145 +  On output, xmm0 through xmm7 contain rows 0 through 7 (no rounding or shift).*/
   1.146 +#define OC_IDCT_8x8_D \
   1.147 +  "#OC_IDCT_8x8_D\n\t" \
   1.148 +  /*Stage 4: \
   1.149 +    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
   1.150 +    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
   1.151 +    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
   1.152 +    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
   1.153 +  "psubw %%xmm0,%%xmm7\n\t" \
   1.154 +  "psubw %%xmm1,%%xmm6\n\t" \
   1.155 +  "psubw %%xmm2,%%xmm5\n\t" \
   1.156 +  "psubw %%xmm3,%%xmm4\n\t" \
   1.157 +  "paddw %%xmm0,%%xmm0\n\t" \
   1.158 +  "paddw %%xmm1,%%xmm1\n\t" \
   1.159 +  "paddw %%xmm2,%%xmm2\n\t" \
   1.160 +  "paddw %%xmm3,%%xmm3\n\t" \
   1.161 +  "paddw %%xmm7,%%xmm0\n\t" \
   1.162 +  "paddw %%xmm6,%%xmm1\n\t" \
   1.163 +  "paddw %%xmm5,%%xmm2\n\t" \
   1.164 +  "paddw %%xmm4,%%xmm3\n\t" \
   1.165 +
   1.166 +/*Performs the last stage of the iDCT, then rounds, scales, and stores the rows.
   1.167 +  On input, xmm7 down to xmm4 contain t[0] through t[3], and xmm0 up to xmm3
   1.168 +   contain t[7] down to t[4]; y+0x40 is used as scratch before the final stores.
   1.169 +  On output, xmm0..xmm7 and y hold rows 0..7, each with the bias at c+0x00 added and shifted right by 4.*/
   1.170 +#define OC_IDCT_8x8_D_STORE \
   1.171 +  "#OC_IDCT_8x8_D_STORE\n\t" \
   1.172 +  /*Stage 4: \
   1.173 +    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
   1.174 +    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
   1.175 +    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
   1.176 +    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
   1.177 +  "psubw %%xmm3,%%xmm4\n\t" \
   1.178 +  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
   1.179 +  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
   1.180 +  "psubw %%xmm0,%%xmm7\n\t" \
   1.181 +  "psubw %%xmm1,%%xmm6\n\t" \
   1.182 +  "psubw %%xmm2,%%xmm5\n\t" \
   1.183 +  "paddw %%xmm4,%%xmm7\n\t" \
   1.184 +  "paddw %%xmm4,%%xmm6\n\t" \
   1.185 +  "paddw %%xmm4,%%xmm5\n\t" \
   1.186 +  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
   1.187 +  "paddw %%xmm0,%%xmm0\n\t" \
   1.188 +  "paddw %%xmm1,%%xmm1\n\t" \
   1.189 +  "paddw %%xmm2,%%xmm2\n\t" \
   1.190 +  "paddw %%xmm3,%%xmm3\n\t" \
   1.191 +  "paddw %%xmm7,%%xmm0\n\t" \
   1.192 +  "paddw %%xmm6,%%xmm1\n\t" \
   1.193 +  "psraw $4,%%xmm0\n\t" \
   1.194 +  "paddw %%xmm5,%%xmm2\n\t" \
   1.195 +  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
   1.196 +  "psraw $4,%%xmm1\n\t" \
   1.197 +  "paddw %%xmm4,%%xmm3\n\t" \
   1.198 +  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
   1.199 +  "psraw $4,%%xmm2\n\t" \
   1.200 +  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
   1.201 +  "psraw $4,%%xmm3\n\t" \
   1.202 +  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
   1.203 +  "psraw $4,%%xmm4\n\t" \
   1.204 +  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
   1.205 +  "psraw $4,%%xmm5\n\t" \
   1.206 +  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
   1.207 +  "psraw $4,%%xmm6\n\t" \
   1.208 +  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
   1.209 +  "psraw $4,%%xmm7\n\t" \
   1.210 +  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
   1.211 +
   1.212 +static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   1.213 +  OC_ALIGN16(ogg_int16_t buf[16]);
   1.214 +  /*Full iDCT of the pre-transposed 8x8 matrix _x into _y; buf is scratch for the stage macros.*/
   1.215 +  __asm__ __volatile__(
   1.216 +    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
   1.217 +    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
   1.218 +    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
   1.219 +    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
   1.220 +    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
   1.221 +    OC_IDCT_8x8_ABC(x)
   1.222 +    OC_IDCT_8x8_D
   1.223 +    OC_TRANSPOSE_8x8
   1.224 +    /*Spill rows 0, 1, 4, and 7 to _y, where OC_IDCT_8x8_ABC(y) reloads them.*/
   1.225 +    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
   1.226 +    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
   1.227 +    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
   1.228 +    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
   1.229 +    OC_IDCT_8x8_ABC(y)
   1.230 +    OC_IDCT_8x8_D_STORE
   1.231 +    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
   1.232 +     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
   1.233 +    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
   1.234 +     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
   1.235 +  );
   1.236 +  if(_x!=_y){
   1.237 +    int i;
   1.238 +    /*Clear input data for next block (decoder only), 32 coefficients per pass.*/
   1.239 +    for(i=0;i<2;i++){
   1.240 +      __asm__ __volatile__(
   1.241 +        "pxor %%xmm0,%%xmm0\n\t" /*Zeroed per asm: registers are not kept across asm statements.*/
   1.242 +        "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
   1.243 +        "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
   1.244 +        "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
   1.245 +        "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
   1.246 +        :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32))
   1.247 +      );
   1.248 +    }
   1.249 +  }
   1.250 +}
   1.251 +
   1.252 +/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
   1.253 +   need to work with four columns at a time (rows X0..X3 in mm0..mm3; X4..X7=0).
   1.254 +  Doing this in MMX is faster on processors with a 64-bit data path.*/
   1.255 +#define OC_IDCT_8x8_10_MMX \
   1.256 +  "#OC_IDCT_8x8_10_MMX\n\t" \
   1.257 +  /*Stage 1:*/ \
   1.258 +  /*2-3 rotation by 6pi/16. \
   1.259 +    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
   1.260 +  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
   1.261 +  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
   1.262 +  "pmulhw %%mm2,%%mm6\n\t" \
   1.263 +  "pmulhw %%mm2,%%mm7\n\t" \
   1.264 +  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
   1.265 +  "paddw %%mm6,%%mm2\n\t" \
   1.266 +  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   1.267 +  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
   1.268 +  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   1.269 +  /*5-6 rotation by 3pi/16. \
   1.270 +    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
   1.271 +  "pmulhw %%mm3,%%mm5\n\t" \
   1.272 +  "pmulhw %%mm3,%%mm2\n\t" \
   1.273 +  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
   1.274 +  "paddw %%mm3,%%mm5\n\t" \
   1.275 +  "paddw %%mm3,%%mm2\n\t" \
   1.276 +  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
   1.277 +  /*4-7 rotation by 7pi/16. \
   1.278 +    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
   1.279 +  "pmulhw %%mm1,%%mm3\n\t" \
   1.280 +  "pmulhw %%mm1,%%mm7\n\t" \
   1.281 +  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
   1.282 +  "movq %%mm3,%%mm6\n\t" \
   1.283 +  "paddw %%mm1,%%mm7\n\t" \
   1.284 +  /*0-1 butterfly. \
   1.285 +    mm4=C4, mm0=X0, X4=0.*/ \
   1.286 +  /*Stage 2:*/ \
   1.287 +  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
   1.288 +    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
   1.289 +  "psubw %%mm5,%%mm3\n\t" \
   1.290 +  "paddw %%mm5,%%mm6\n\t" \
   1.291 +  "movq %%mm4,%%mm1\n\t" \
   1.292 +  "pmulhw %%mm0,%%mm4\n\t" \
   1.293 +  "paddw %%mm0,%%mm4\n\t" \
   1.294 +  "movq %%mm7,%%mm0\n\t" \
   1.295 +  "movq %%mm4,%%mm5\n\t" \
   1.296 +  "paddw %%mm2,%%mm0\n\t" \
   1.297 +  "psubw %%mm2,%%mm7\n\t" \
   1.298 +  "movq %%mm1,%%mm2\n\t" \
   1.299 +  "pmulhw %%mm6,%%mm1\n\t" \
   1.300 +  "pmulhw %%mm7,%%mm2\n\t" \
   1.301 +  "paddw %%mm6,%%mm1\n\t" \
   1.302 +  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
   1.303 +  "paddw %%mm7,%%mm2\n\t" \
   1.304 +  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
   1.305 +  /*Stage 3: \
   1.306 +    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
   1.307 +    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
   1.308 +    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
   1.309 +  "paddw %%mm2,%%mm1\n\t" \
   1.310 +  "paddw %%mm5,%%mm6\n\t" \
   1.311 +  "paddw %%mm4,%%mm7\n\t" \
   1.312 +  "paddw %%mm2,%%mm2\n\t" \
   1.313 +  "paddw %%mm4,%%mm4\n\t" \
   1.314 +  "paddw %%mm5,%%mm5\n\t" \
   1.315 +  "psubw %%mm1,%%mm2\n\t" \
   1.316 +  "psubw %%mm7,%%mm4\n\t" \
   1.317 +  "psubw %%mm6,%%mm5\n\t" \
   1.318 +  /*Stage 4: \
   1.319 +    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
   1.320 +    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
   1.321 +    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
   1.322 +    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
   1.323 +  "psubw %%mm0,%%mm7\n\t" \
   1.324 +  "psubw %%mm1,%%mm6\n\t" \
   1.325 +  "psubw %%mm2,%%mm5\n\t" \
   1.326 +  "psubw %%mm3,%%mm4\n\t" \
   1.327 +  "paddw %%mm0,%%mm0\n\t" \
   1.328 +  "paddw %%mm1,%%mm1\n\t" \
   1.329 +  "paddw %%mm2,%%mm2\n\t" \
   1.330 +  "paddw %%mm3,%%mm3\n\t" \
   1.331 +  "paddw %%mm7,%%mm0\n\t" \
   1.332 +  "paddw %%mm6,%%mm1\n\t" \
   1.333 +  "paddw %%mm5,%%mm2\n\t" \
   1.334 +  "paddw %%mm4,%%mm3\n\t" \
   1.335 +
   1.336 +#define OC_IDCT_8x8_10_ABC \
   1.337 +  "#OC_IDCT_8x8_10_ABC\n\t" \
   1.338 +  /*Stage 1 (on input, xmm0..xmm3 hold rows X0..X3; X4..X7 are taken as 0):*/ \
   1.339 +  /*2-3 rotation by 6pi/16. \
   1.340 +    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
   1.341 +  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
   1.342 +  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
   1.343 +  "pmulhw %%xmm2,%%xmm6\n\t" \
   1.344 +  "pmulhw %%xmm2,%%xmm7\n\t" \
   1.345 +  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
   1.346 +  "paddw %%xmm6,%%xmm2\n\t" \
   1.347 +  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   1.348 +  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
   1.349 +  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   1.350 +  /*5-6 rotation by 3pi/16. \
   1.351 +    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
   1.352 +  "pmulhw %%xmm3,%%xmm5\n\t" \
   1.353 +  "pmulhw %%xmm3,%%xmm2\n\t" \
   1.354 +  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
   1.355 +  "paddw %%xmm3,%%xmm5\n\t" \
   1.356 +  "paddw %%xmm3,%%xmm2\n\t" \
   1.357 +  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
   1.358 +  /*4-7 rotation by 7pi/16. \
   1.359 +    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
   1.360 +  "pmulhw %%xmm1,%%xmm3\n\t" \
   1.361 +  "pmulhw %%xmm1,%%xmm7\n\t" \
   1.362 +  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
   1.363 +  "movdqa %%xmm3,%%xmm6\n\t" \
   1.364 +  "paddw %%xmm1,%%xmm7\n\t" \
   1.365 +  /*0-1 butterfly. \
   1.366 +    xmm4=C4, xmm0=X0, X4=0.*/ \
   1.367 +  /*Stage 2:*/ \
   1.368 +  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
   1.369 +    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
   1.370 +  "psubw %%xmm5,%%xmm3\n\t" \
   1.371 +  "paddw %%xmm5,%%xmm6\n\t" \
   1.372 +  "movdqa %%xmm4,%%xmm1\n\t" \
   1.373 +  "pmulhw %%xmm0,%%xmm4\n\t" \
   1.374 +  "paddw %%xmm0,%%xmm4\n\t" \
   1.375 +  "movdqa %%xmm7,%%xmm0\n\t" \
   1.376 +  "movdqa %%xmm4,%%xmm5\n\t" \
   1.377 +  "paddw %%xmm2,%%xmm0\n\t" \
   1.378 +  "psubw %%xmm2,%%xmm7\n\t" \
   1.379 +  "movdqa %%xmm1,%%xmm2\n\t" \
   1.380 +  "pmulhw %%xmm6,%%xmm1\n\t" \
   1.381 +  "pmulhw %%xmm7,%%xmm2\n\t" \
   1.382 +  "paddw %%xmm6,%%xmm1\n\t" \
   1.383 +  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
   1.384 +  "paddw %%xmm7,%%xmm2\n\t" \
   1.385 +  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
   1.386 +  /*Stage 3: \
   1.387 +    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
   1.388 +    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
   1.389 +    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
   1.390 +  "paddw %%xmm2,%%xmm1\n\t" \
   1.391 +  "paddw %%xmm5,%%xmm6\n\t" \
   1.392 +  "paddw %%xmm4,%%xmm7\n\t" \
   1.393 +  "paddw %%xmm2,%%xmm2\n\t" \
   1.394 +  "paddw %%xmm4,%%xmm4\n\t" \
   1.395 +  "paddw %%xmm5,%%xmm5\n\t" \
   1.396 +  "psubw %%xmm1,%%xmm2\n\t" \
   1.397 +  "psubw %%xmm7,%%xmm4\n\t" \
   1.398 +  "psubw %%xmm6,%%xmm5\n\t" \
   1.399 +
   1.400 +static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
   1.401 +  OC_ALIGN16(ogg_int16_t buf[16]);
   1.402 +  /*This routine accepts an 8x8 matrix pre-transposed; only the first four coefficients of rows 0 through 3 are read.*/
   1.403 +  __asm__ __volatile__(
   1.404 +    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
   1.405 +    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
   1.406 +    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
   1.407 +    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
   1.408 +    OC_IDCT_8x8_10_MMX
   1.409 +    OC_TRANSPOSE_8x4_MMX2SSE
   1.410 +    OC_IDCT_8x8_10_ABC
   1.411 +    OC_IDCT_8x8_D_STORE
   1.412 +    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
   1.413 +     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
   1.414 +    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
   1.415 +     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
   1.416 +  );
   1.417 +  if(_x!=_y){
   1.418 +    /*Clear input data for next block (decoder only): just the 16 coefficients read above.*/
   1.419 +    __asm__ __volatile__(
   1.420 +      "pxor %%mm0,%%mm0\n\t"
   1.421 +      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
   1.422 +      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
   1.423 +      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
   1.424 +      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
   1.425 +      :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
   1.426 +    );
   1.427 +  }
   1.428 +}
   1.429 +
   1.430 +/*Computes the inverse of an 8x8 Type-II DCT.
   1.431 +  The input is expected to carry a scale factor of 4 relative to the
   1.432 +   orthonormal form of the transform.*/
   1.433 +void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
   1.434 +  /*Note that _last_zzi is not simply the number of coefficients decoded for
   1.435 +     this block.
   1.436 +    It is the value zzi held BEFORE the block's final token was
   1.437 +     decoded.
   1.438 +    Usually that token is an EOB (continuing an EOB run from an earlier block
   1.439 +     counts too), in which case the two numbers agree.
   1.440 +    If instead the final token was NOT an EOB, but completed the block at
   1.441 +     exactly 64 coefficients, _last_zzi ends up below 64.
   1.442 +    As long as that token was not a pure zero run, _last_zzi is at least 46,
   1.443 +     which does not change any decision made in this routine.
   1.444 +    If it WAS a pure zero run of length 63, though, _last_zzi is 1 even though
   1.445 +     64 coefficients were decoded.
   1.446 +    The reduced transform below is then selected, although the true
   1.447 +     coefficient count would not have selected it.
   1.448 +    Likewise, a zero run of length 64 leaves _last_zzi at 0, yet the DC
   1.449 +     coefficient is still processed, since DC prediction may have made it
   1.450 +     non-zero.
   1.451 +    Convoluted as it is, this is arguably the right behavior: a block ending
   1.452 +     in a long zero run, rather than a normal EOB token, still gets the
   1.453 +     smaller transform.
   1.454 +    It could be smarter: several separate zero runs at the end of a block
   1.455 +     will defeat it, but an encoder that produces those really deserves what
   1.456 +     it gets.
   1.457 +    Needless to say, this approach was inherited from VP3.*/
   1.458 +  /*Select the transform: the full one unless the reduced one suffices.*/
   1.459 +  if(_last_zzi>10)oc_idct8x8_slow_sse2(_y,_x);
   1.460 +  else oc_idct8x8_10_sse2(_y,_x);
   1.461 +}
   1.462 +
   1.463 +#endif

mercurial