media/libtheora/lib/x86/sse2trans.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libtheora/lib/x86/sse2trans.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,242 @@
     1.4 +/********************************************************************
     1.5 + *                                                                  *
     1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
    1.10 + *                                                                  *
    1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
    1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    1.13 + *                                                                  *
    1.14 + ********************************************************************
    1.15 +
    1.16 +  function:
    1.17 +    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
    1.18 +
    1.19 + ********************************************************************/
    1.20 +
    1.21 +#if !defined(_x86_sse2trans_H)
    1.22 +# define _x86_sse2trans_H (1)
    1.23 +# include "x86int.h"
    1.24 +
    1.25 +# if defined(OC_X86_64_ASM)
    1.26 +/*x86-64 version: transposes an 8x8 block of 16-bit values entirely in
    1.27 +   registers, without spilling anything to memory.
    1.28 +  Rows a...h are read from xmm0...xmm7 and columns 0...7 are left in xmm0...xmm7,
    1.29 +   both in order; the order of the butterflies makes xmm8 the only temporary.*/
    1.30 +#  define OC_TRANSPOSE_8x8 \
    1.31 + "#OC_TRANSPOSE_8x8\n\t" \
    1.32 + "movdqa %%xmm4,%%xmm8\n\t" \
    1.33 + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
    1.34 + "punpcklwd %%xmm5,%%xmm4\n\t" \
    1.35 + /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
    1.36 + "punpckhwd %%xmm5,%%xmm8\n\t" \
    1.37 + /*xmm5 is free.*/ \
    1.38 + "movdqa %%xmm0,%%xmm5\n\t" \
    1.39 + /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
    1.40 + "punpcklwd %%xmm1,%%xmm0\n\t" \
    1.41 + /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
    1.42 + "punpckhwd %%xmm1,%%xmm5\n\t" \
    1.43 + /*xmm1 is free.*/ \
    1.44 + "movdqa %%xmm6,%%xmm1\n\t" \
    1.45 + /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
    1.46 + "punpcklwd %%xmm7,%%xmm6\n\t" \
    1.47 + /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
    1.48 + "punpckhwd %%xmm7,%%xmm1\n\t" \
    1.49 + /*xmm7 is free.*/ \
    1.50 + "movdqa %%xmm2,%%xmm7\n\t" \
    1.51 + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
    1.52 + "punpckhwd %%xmm3,%%xmm2\n\t" \
    1.53 + /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
    1.54 + "punpcklwd %%xmm3,%%xmm7\n\t" \
    1.55 + /*xmm3 is free; all 16-bit interleaves are done, 32-bit interleaves follow.*/ \
    1.56 + "movdqa %%xmm0,%%xmm3\n\t" \
    1.57 + /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
    1.58 + "punpckldq %%xmm7,%%xmm0\n\t" \
    1.59 + /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
    1.60 + "punpckhdq %%xmm7,%%xmm3\n\t" \
    1.61 + /*xmm7 is free.*/ \
    1.62 + "movdqa %%xmm5,%%xmm7\n\t" \
    1.63 + /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
    1.64 + "punpckldq %%xmm2,%%xmm5\n\t" \
    1.65 + /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
    1.66 + "punpckhdq %%xmm2,%%xmm7\n\t" \
    1.67 + /*xmm2 is free.*/ \
    1.68 + "movdqa %%xmm4,%%xmm2\n\t" \
    1.69 + /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
    1.70 + "punpckhdq %%xmm6,%%xmm4\n\t" \
    1.71 + /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
    1.72 + "punpckldq %%xmm6,%%xmm2\n\t" \
    1.73 + /*xmm6 is free.*/ \
    1.74 + "movdqa %%xmm8,%%xmm6\n\t" \
    1.75 + /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
    1.76 + "punpckldq %%xmm1,%%xmm6\n\t" \
    1.77 + /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
    1.78 + "punpckhdq %%xmm1,%%xmm8\n\t" \
    1.79 + /*xmm1 is free; all 32-bit interleaves are done, 64-bit interleaves follow.*/ \
    1.80 + "movdqa %%xmm0,%%xmm1\n\t" \
    1.81 + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
    1.82 + "punpcklqdq %%xmm2,%%xmm0\n\t" \
    1.83 + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
    1.84 + "punpckhqdq %%xmm2,%%xmm1\n\t" \
    1.85 + /*xmm2 is free.*/ \
    1.86 + "movdqa %%xmm3,%%xmm2\n\t" \
    1.87 + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
    1.88 + "punpckhqdq %%xmm4,%%xmm3\n\t" \
    1.89 + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
    1.90 + "punpcklqdq %%xmm4,%%xmm2\n\t" \
    1.91 + /*xmm4 is free.*/ \
    1.92 + "movdqa %%xmm5,%%xmm4\n\t" \
    1.93 + /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
    1.94 + "punpckhqdq %%xmm6,%%xmm5\n\t" \
    1.95 + /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
    1.96 + "punpcklqdq %%xmm6,%%xmm4\n\t" \
    1.97 + /*xmm6 is free.*/ \
    1.98 + "movdqa %%xmm7,%%xmm6\n\t" \
    1.99 + /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
   1.100 + "punpckhqdq %%xmm8,%%xmm7\n\t" \
   1.101 + /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
   1.102 + "punpcklqdq %%xmm8,%%xmm6\n\t" \
   1.103 + /*xmm8 is free; xmm0...xmm7 now hold columns 0...7 in order.*/ \
   1.104 +
   1.105 +# else
   1.106 +/*Otherwise, only xmm0...xmm7 are used and some values are spilled to %[buf]
   1.107 +   temporarily: two 16-byte slots at offsets 0x00 and 0x10, accessed with movdqa
   1.108 +   (NOTE(review): movdqa needs 16-byte aligned memory; confirm buf's alignment at the call site).
   1.109 +  Rows a...h enter in xmm0...xmm7 and columns 0...7 leave in xmm0...xmm7; the butterfly order limits spills and delays each load's first use.*/
   1.110 +#  define OC_TRANSPOSE_8x8 \
   1.111 + "#OC_TRANSPOSE_8x8\n\t" \
   1.112 + /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
   1.113 + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   1.114 + /*xmm0 is free.*/ \
   1.115 + "movdqa %%xmm2,%%xmm0\n\t" \
   1.116 + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
   1.117 + "punpckhwd %%xmm3,%%xmm2\n\t" \
   1.118 + /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
   1.119 + "punpcklwd %%xmm3,%%xmm0\n\t" \
   1.120 + /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
   1.121 + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
   1.122 + /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
   1.123 + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   1.124 + /*xmm2 is free.*/ \
   1.125 + "movdqa %%xmm6,%%xmm2\n\t" \
   1.126 + /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
   1.127 + "punpcklwd %%xmm7,%%xmm6\n\t" \
   1.128 + /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
   1.129 + "punpckhwd %%xmm7,%%xmm2\n\t" \
   1.130 + /*xmm7 is free.*/ \
   1.131 + "movdqa %%xmm4,%%xmm7\n\t" \
   1.132 + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
   1.133 + "punpcklwd %%xmm5,%%xmm4\n\t" \
   1.134 + /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
   1.135 + "punpckhwd %%xmm5,%%xmm7\n\t" \
   1.136 + /*xmm5 is free.*/ \
   1.137 + "movdqa %%xmm3,%%xmm5\n\t" \
   1.138 + /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
   1.139 + "punpcklwd %%xmm1,%%xmm3\n\t" \
   1.140 + /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
   1.141 + "punpckhwd %%xmm1,%%xmm5\n\t" \
   1.142 + /*xmm1 is free; all 16-bit interleaves are done, 32-bit interleaves follow.*/ \
   1.143 + "movdqa %%xmm7,%%xmm1\n\t" \
   1.144 + /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
   1.145 + "punpckldq %%xmm2,%%xmm7\n\t" \
   1.146 + /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
   1.147 + "punpckhdq %%xmm2,%%xmm1\n\t" \
   1.148 + /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
   1.149 + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
   1.150 + /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
   1.151 + "movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
   1.152 + /*xmm1 is free.*/ \
   1.153 + "movdqa %%xmm3,%%xmm1\n\t" \
   1.154 + /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
   1.155 + "punpckhdq %%xmm0,%%xmm3\n\t" \
   1.156 + /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
   1.157 + "punpckldq %%xmm0,%%xmm1\n\t" \
   1.158 + /*xmm0 is free.*/ \
   1.159 + "movdqa %%xmm4,%%xmm0\n\t" \
   1.160 + /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
   1.161 + "punpckhdq %%xmm6,%%xmm4\n\t" \
   1.162 + /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
   1.163 + "punpckldq %%xmm6,%%xmm0\n\t" \
   1.164 + /*xmm6 is free.*/ \
   1.165 + "movdqa %%xmm5,%%xmm6\n\t" \
   1.166 + /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
   1.167 + "punpckldq %%xmm2,%%xmm5\n\t" \
   1.168 + /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
   1.169 + "punpckhdq %%xmm2,%%xmm6\n\t" \
   1.170 + /*xmm2 is free; all 32-bit interleaves are done, 64-bit interleaves follow.*/ \
   1.171 + "movdqa %%xmm1,%%xmm2\n\t" \
   1.172 + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
   1.173 + "punpckhqdq %%xmm0,%%xmm1\n\t" \
   1.174 + /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   1.175 + "punpcklqdq %%xmm0,%%xmm2\n\t" \
   1.176 + /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
   1.177 + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
   1.178 + /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   1.179 + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
   1.180 + /*xmm2 is free.*/ \
   1.181 + "movdqa %%xmm3,%%xmm2\n\t" \
   1.182 + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
   1.183 + "punpckhqdq %%xmm4,%%xmm3\n\t" \
   1.184 + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
   1.185 + "punpcklqdq %%xmm4,%%xmm2\n\t" \
   1.186 + /*xmm4 is free.*/ \
   1.187 + "movdqa %%xmm5,%%xmm4\n\t" \
   1.188 + /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
   1.189 + "punpckhqdq %%xmm7,%%xmm5\n\t" \
   1.190 + /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
   1.191 + "punpcklqdq %%xmm7,%%xmm4\n\t" \
   1.192 + /*xmm7 is free.*/ \
   1.193 + "movdqa %%xmm6,%%xmm7\n\t" \
   1.194 + /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
   1.195 + "punpcklqdq %%xmm0,%%xmm6\n\t" \
   1.196 + /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
   1.197 + "punpckhqdq %%xmm0,%%xmm7\n\t" \
   1.198 + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0 (column 0, reloaded from buf[1] last)*/ \
   1.199 + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
   1.200 +
   1.201 +# endif
   1.202 +
   1.203 +/*Transpose 4 16-bit values in each of 8 MMX registers (rows a...h in mm0...mm7)
   1.204 +   into 8 values in each of the first four SSE registers (columns 0...3 in xmm0...xmm3); xmm4...xmm7 are clobbered.
   1.205 +  No need to be clever here; we have plenty of room.*/
   1.206 +#  define OC_TRANSPOSE_8x4_MMX2SSE \
   1.207 + "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
   1.208 + "movq2dq %%mm0,%%xmm0\n\t" \
   1.209 + "movq2dq %%mm1,%%xmm1\n\t" \
   1.210 + /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
   1.211 + "punpcklwd %%xmm1,%%xmm0\n\t" \
   1.212 + "movq2dq %%mm2,%%xmm3\n\t" \
   1.213 + "movq2dq %%mm3,%%xmm2\n\t" \
   1.214 + /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
   1.215 + "punpcklwd %%xmm2,%%xmm3\n\t" \
   1.216 + "movq2dq %%mm4,%%xmm4\n\t" \
   1.217 + "movq2dq %%mm5,%%xmm5\n\t" \
   1.218 + /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
   1.219 + "punpcklwd %%xmm5,%%xmm4\n\t" \
   1.220 + "movq2dq %%mm6,%%xmm7\n\t" \
   1.221 + "movq2dq %%mm7,%%xmm6\n\t" \
   1.222 + /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
   1.223 + "punpcklwd %%xmm6,%%xmm7\n\t" \
   1.224 + "movdqa %%xmm0,%%xmm2\n\t" \
   1.225 + /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
   1.226 + "punpckldq %%xmm3,%%xmm0\n\t" \
   1.227 + /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
   1.228 + "punpckhdq %%xmm3,%%xmm2\n\t" \
   1.229 + "movdqa %%xmm4,%%xmm5\n\t" \
   1.230 + /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
   1.231 + "punpckldq %%xmm7,%%xmm4\n\t" \
   1.232 + /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
   1.233 + "punpckhdq %%xmm7,%%xmm5\n\t" \
   1.234 + "movdqa %%xmm0,%%xmm1\n\t" \
   1.235 + /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
   1.236 + "punpcklqdq %%xmm4,%%xmm0\n\t" \
   1.237 + /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
   1.238 + "punpckhqdq %%xmm4,%%xmm1\n\t" \
   1.239 + "movdqa %%xmm2,%%xmm3\n\t" \
   1.240 + /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
   1.241 + "punpcklqdq %%xmm5,%%xmm2\n\t" \
   1.242 + /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
   1.243 + "punpckhqdq %%xmm5,%%xmm3\n\t" \
   1.244 +
   1.245 +#endif

mercurial