media/libtheora/lib/x86/mmxloop.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libtheora/lib/x86/mmxloop.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,318 @@
     1.4 +#if !defined(_x86_mmxloop_H)
     1.5 +# define _x86_mmxloop_H (1)
     1.6 +# include <stddef.h>
     1.7 +# include "x86int.h"
     1.8 +
     1.9 +#if defined(OC_X86_ASM)
    1.10 +
    1.11 +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
    1.12 +  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
    1.13 +   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
     1.14 +#define OC_LOOP_FILTER8_MMX \
     1.15 +  "#OC_LOOP_FILTER8_MMX\n\t" \
     1.16 +  /*mm7=0*/ \
     1.17 +  "pxor %%mm7,%%mm7\n\t" \
     1.18 +  /*mm6:mm0={a0,...,a7}*/ \
     1.19 +  "movq %%mm0,%%mm6\n\t" \
     1.20 +  "punpcklbw %%mm7,%%mm0\n\t" \
     1.21 +  "punpckhbw %%mm7,%%mm6\n\t" \
     1.22 +  /*mm3:mm5={d0,...,d7}*/ \
     1.23 +  "movq %%mm3,%%mm5\n\t" \
     1.24 +  "punpcklbw %%mm7,%%mm3\n\t" \
     1.25 +  "punpckhbw %%mm7,%%mm5\n\t" \
     1.26 +  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
     1.27 +  "psubw %%mm3,%%mm0\n\t" \
     1.28 +  "psubw %%mm5,%%mm6\n\t" \
     1.29 +  /*mm3:mm1={b0,...,b7}*/ \
     1.30 +  "movq %%mm1,%%mm3\n\t" \
     1.31 +  "punpcklbw %%mm7,%%mm1\n\t" \
     1.32 +  "movq %%mm2,%%mm4\n\t" \
     1.33 +  "punpckhbw %%mm7,%%mm3\n\t" \
     1.34 +  /*mm5:mm4={c0,...,c7}*/ \
     1.35 +  "movq %%mm2,%%mm5\n\t" \
     1.36 +  "punpcklbw %%mm7,%%mm4\n\t" \
     1.37 +  "punpckhbw %%mm7,%%mm5\n\t" \
     1.38 +  /*mm7={3}x4 \
     1.39 +    mm5:mm4={c0-b0,...,c7-b7}*/ \
     1.40 +  "pcmpeqw %%mm7,%%mm7\n\t" \
     1.41 +  "psubw %%mm1,%%mm4\n\t" \
     1.42 +  "psrlw $14,%%mm7\n\t" \
     1.43 +  "psubw %%mm3,%%mm5\n\t" \
     1.44 +  /*Scale by 3.*/ \
     1.45 +  "pmullw %%mm7,%%mm4\n\t" \
     1.46 +  "pmullw %%mm7,%%mm5\n\t" \
     1.47 +  /*mm7={4}x4 \
     1.48 +    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
     1.49 +  "psrlw $1,%%mm7\n\t" \
     1.50 +  "paddw %%mm0,%%mm4\n\t" \
     1.51 +  "psllw $2,%%mm7\n\t" \
     1.52 +  "movq (%[ll]),%%mm0\n\t" \
     1.53 +  "paddw %%mm6,%%mm5\n\t" \
     1.54 +  /*R_i has the range [-127,128], so we compute -R_i instead. \
     1.55 +    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
     1.56 +  "psubw %%mm7,%%mm4\n\t" \
     1.57 +  "psubw %%mm7,%%mm5\n\t" \
     1.58 +  "psraw $3,%%mm4\n\t" \
     1.59 +  "psraw $3,%%mm5\n\t" \
     1.60 +  "pcmpeqb %%mm7,%%mm7\n\t" \
     1.61 +  "packsswb %%mm5,%%mm4\n\t" \
     1.62 +  "pxor %%mm6,%%mm6\n\t" \
     1.63 +  "pxor %%mm7,%%mm4\n\t" \
     1.64 +  "packuswb %%mm3,%%mm1\n\t" \
     1.65 +  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
     1.66 +  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
     1.67 +     we have to split things by sign (the other option is to work in 16 bits, \
     1.68 +     but working in 8 bits gives much better parallelism). \
     1.69 +    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
     1.70 +    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
     1.71 +    Finally, we split mm4 into positive and negative pieces using the mask in \
     1.72 +     mm6, and add and subtract them as appropriate.*/ \
     1.73 +  /*mm4=abs(-R_i)*/ \
     1.74 +  /*mm7=255-2*L*/ \
     1.75 +  "pcmpgtb %%mm4,%%mm6\n\t" \
     1.76 +  "psubb %%mm0,%%mm7\n\t" \
     1.77 +  "pxor %%mm6,%%mm4\n\t" \
     1.78 +  "psubb %%mm0,%%mm7\n\t" \
     1.79 +  "psubb %%mm6,%%mm4\n\t" \
     1.80 +  /*mm7=255-max(2*L-abs(R_i),0)*/ \
     1.81 +  "paddusb %%mm4,%%mm7\n\t" \
     1.82 +  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
     1.83 +  "paddusb %%mm7,%%mm4\n\t" \
     1.84 +  "psubusb %%mm7,%%mm4\n\t" \
     1.85 +  /*Now split mm4 by the original sign of -R_i.*/ \
     1.86 +  "movq %%mm4,%%mm5\n\t" \
     1.87 +  "pand %%mm6,%%mm4\n\t" \
     1.88 +  "pandn %%mm5,%%mm6\n\t" \
     1.89 +  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
     1.90 +  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
     1.91 +  "paddusb %%mm4,%%mm1\n\t" \
     1.92 +  "psubusb %%mm4,%%mm2\n\t" \
     1.93 +  "psubusb %%mm6,%%mm1\n\t" \
     1.94 +  "paddusb %%mm6,%%mm2\n\t" \
     1.95 +
    1.96 +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
    1.97 +  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
    1.98 +   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
    1.99 +  All other MMX registers are clobbered.*/
    1.100 +#define OC_LOOP_FILTER8_MMXEXT \
    1.101 +  "#OC_LOOP_FILTER8_MMXEXT\n\t" \
    1.102 +  /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
    1.103 +     -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
    1.104 +  /*This first part is based on the transformation \
    1.105 +      f = -(3*(c-b)+a-d+4>>3) \
    1.106 +        = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
    1.107 +        = -(3*(c+~b)+(a+~d)-1016>>3) \
    1.108 +        = 127-(3*(c+~b)+(a+~d)>>3) \
    1.109 +        = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
    1.110 +    Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
    1.111 +     fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
    1.112 +    Using this, the last expression above can be computed in 8 bits of working \
    1.113 +     precision via: \
    1.114 +      u = ~pavgb(~b,c); \
    1.115 +      v = pavgb(b,~c); \
    1.116 +      This mask is 0 or 0xFF, and controls whether t is biased up or down: \
    1.117 +      m = u-v; \
    1.118 +      t = m^pavgb(m^~a,m^d); \
    1.119 +      f = 128+pavgb(pavgb(t,u),v); \
    1.120 +    This required some careful analysis to ensure that carries are propagated \
    1.121 +     correctly in all cases, but has been checked exhaustively.*/ \
    1.122 +  /*input (a, b, c, d, ., ., ., .)*/ \
    1.123 +  /*ff=0xFF; \
    1.124 +    u=b; \
    1.125 +    v=c; \
    1.126 +    ll=255-2*L;*/ \
    1.127 +  "pcmpeqb %%mm7,%%mm7\n\t" \
    1.128 +  "movq %%mm1,%%mm4\n\t" \
    1.129 +  "movq %%mm2,%%mm5\n\t" \
    1.130 +  "movq (%[ll]),%%mm6\n\t" \
    1.131 +  /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
    1.132 +  /*u^=ff; \
    1.133 +    v^=ff;*/ \
    1.134 +  "pxor %%mm7,%%mm4\n\t" \
    1.135 +  "pxor %%mm7,%%mm5\n\t" \
    1.136 +  /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
    1.137 +  /*u=pavgb(u,c); \
    1.138 +    v=pavgb(v,b);*/ \
    1.139 +  "pavgb %%mm2,%%mm4\n\t" \
    1.140 +  "pavgb %%mm1,%%mm5\n\t" \
    1.141 +  /*u^=ff; \
    1.142 +    a^=ff;*/ \
    1.143 +  "pxor %%mm7,%%mm4\n\t" \
    1.144 +  "pxor %%mm7,%%mm0\n\t" \
    1.145 +  /*m=u-v;*/ \
    1.146 +  "psubb %%mm5,%%mm4\n\t" \
    1.147 +  /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
    1.148 +  /*a^=m; \
    1.149 +    d^=m;*/ \
    1.150 +  "pxor %%mm4,%%mm0\n\t" \
    1.151 +  "pxor %%mm4,%%mm3\n\t" \
    1.152 +  /*t=pavgb(a,d);*/ \
    1.153 +  "pavgb %%mm3,%%mm0\n\t" \
    1.154 +  "psllw $7,%%mm7\n\t" \
    1.155 +  /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
    1.156 +  /*t^=m; \
    1.157 +    u=m+v;*/ \
    1.158 +  "pxor %%mm4,%%mm0\n\t" \
    1.159 +  "paddb %%mm5,%%mm4\n\t" \
    1.160 +  /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
    1.161 +  /*f=pavgb(f,u); \
    1.162 +    of=128;*/ \
    1.163 +  "pavgb %%mm4,%%mm0\n\t" \
    1.164 +  "packsswb %%mm7,%%mm7\n\t" \
    1.165 +  /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
    1.166 +  /*f=pavgb(f,v);*/ \
    1.167 +  "pavgb %%mm5,%%mm0\n\t" \
    1.168 +  "movq %%mm7,%%mm3\n\t" \
    1.169 +  "movq %%mm6,%%mm4\n\t" \
    1.170 +  /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
    1.171 +  /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
    1.172 +  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
    1.173 +     we have to split things by sign (the other option is to work in 16 bits, \
    1.174 +     but staying in 8 bits gives much better parallelism).*/ \
    1.175 +  /*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
    1.176 +    This is the same number of instructions as computing a mask and splitting \
    1.177 +     after the lflim computation, but has shorter dependency chains.*/ \
    1.178 +  /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
    1.179 +    mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
    1.180 +  "psubusb %%mm0,%%mm3\n\t" \
    1.181 +  "psubusb %%mm7,%%mm0\n\t" \
    1.182 +  /*mm6=255-max(2*L-abs(R_i<0),0) \
    1.183 +    mm4=255-max(2*L-abs(R_i>0),0)*/ \
    1.184 +  "paddusb %%mm3,%%mm4\n\t" \
    1.185 +  "paddusb %%mm0,%%mm6\n\t" \
    1.186 +  /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
    1.187 +    mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
    1.188 +  "paddusb %%mm4,%%mm3\n\t" \
    1.189 +  "paddusb %%mm6,%%mm0\n\t" \
    1.190 +  "psubusb %%mm4,%%mm3\n\t" \
    1.191 +  "psubusb %%mm6,%%mm0\n\t" \
    1.192 +  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
    1.193 +  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
    1.194 +  "paddusb %%mm3,%%mm1\n\t" \
    1.195 +  "psubusb %%mm3,%%mm2\n\t" \
    1.196 +  "psubusb %%mm0,%%mm1\n\t" \
    1.197 +  "paddusb %%mm0,%%mm2\n\t" \
    1.198 +
    1.199 +#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
    1.200 +  do{ \
    1.201 +    ptrdiff_t ystride3__; \
    1.202 +    __asm__ __volatile__( \
    1.203 +      /*mm0={a0,...,a7}*/ \
    1.204 +      "movq (%[pix]),%%mm0\n\t" \
    1.205 +      /*ystride3=_ystride*3*/ \
    1.206 +      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
    1.207 +      /*mm3={d0,...,d7}*/ \
    1.208 +      "movq (%[pix],%[ystride3]),%%mm3\n\t" \
    1.209 +      /*mm1={b0,...,b7}*/ \
    1.210 +      "movq (%[pix],%[ystride]),%%mm1\n\t" \
    1.211 +      /*mm2={c0,...,c7}*/ \
    1.212 +      "movq (%[pix],%[ystride],2),%%mm2\n\t" \
    1.213 +      _filter \
    1.214 +      /*Write the filtered b and c rows back out; rows a and d are unchanged.*/ \
    1.215 +      "movq %%mm1,(%[pix],%[ystride])\n\t" \
    1.216 +      "movq %%mm2,(%[pix],%[ystride],2)\n\t" \
    1.217 +      :[ystride3]"=&r"(ystride3__) \
    1.218 +      :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
    1.219 +       [ll]"r"(_ll) \
    1.220 +      :"memory" \
    1.221 +    ); \
    1.222 +  } \
    1.223 +  while(0)
   1.224 +
    1.225 +#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
    1.226 +  do{ \
    1.227 +    unsigned char *pix__; \
    1.228 +    ptrdiff_t      ystride3__; \
    1.229 +    ptrdiff_t      d__; \
    1.230 +    pix__=(_pix)-2; \
    1.231 +    __asm__ __volatile__( \
    1.232 +      /*x x x x d0 c0 b0 a0*/ \
    1.233 +      "movd (%[pix]),%%mm0\n\t" \
    1.234 +      /*x x x x d1 c1 b1 a1*/ \
    1.235 +      "movd (%[pix],%[ystride]),%%mm1\n\t" \
    1.236 +      /*ystride3=_ystride*3*/ \
    1.237 +      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
    1.238 +      /*x x x x d2 c2 b2 a2*/ \
    1.239 +      "movd (%[pix],%[ystride],2),%%mm2\n\t" \
    1.240 +      /*x x x x d3 c3 b3 a3*/ \
    1.241 +      "lea (%[pix],%[ystride],4),%[d]\n\t" \
    1.242 +      "movd (%[pix],%[ystride3]),%%mm3\n\t" \
    1.243 +      /*x x x x d4 c4 b4 a4*/ \
    1.244 +      "movd (%[d]),%%mm4\n\t" \
    1.245 +      /*x x x x d5 c5 b5 a5*/ \
    1.246 +      "movd (%[d],%[ystride]),%%mm5\n\t" \
    1.247 +      /*x x x x d6 c6 b6 a6*/ \
    1.248 +      "movd (%[d],%[ystride],2),%%mm6\n\t" \
    1.249 +      /*x x x x d7 c7 b7 a7*/ \
    1.250 +      "movd (%[d],%[ystride3]),%%mm7\n\t" \
    1.251 +      /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
    1.252 +      "punpcklbw %%mm1,%%mm0\n\t" \
    1.253 +      /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
    1.254 +      "punpcklbw %%mm3,%%mm2\n\t" \
    1.255 +      /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
    1.256 +      "movq %%mm0,%%mm3\n\t" \
    1.257 +      /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
    1.258 +      "punpcklwd %%mm2,%%mm0\n\t" \
    1.259 +      /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
    1.260 +      "punpckhwd %%mm2,%%mm3\n\t" \
    1.261 +      /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
    1.262 +      "movq %%mm0,%%mm1\n\t" \
    1.263 +      /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
    1.264 +      "punpcklbw %%mm5,%%mm4\n\t" \
    1.265 +      /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
    1.266 +      "punpcklbw %%mm7,%%mm6\n\t" \
    1.267 +      /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
    1.268 +      "movq %%mm4,%%mm5\n\t" \
    1.269 +      /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
    1.270 +      "punpcklwd %%mm6,%%mm4\n\t" \
    1.271 +      /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
    1.272 +      "punpckhwd %%mm6,%%mm5\n\t" \
    1.273 +      /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
    1.274 +      "movq %%mm3,%%mm2\n\t" \
    1.275 +      /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
    1.276 +      "punpckldq %%mm4,%%mm0\n\t" \
    1.277 +      /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
    1.278 +      "punpckhdq %%mm4,%%mm1\n\t" \
    1.279 +      /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
    1.280 +      "punpckldq %%mm5,%%mm2\n\t" \
    1.281 +      /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
    1.282 +      "punpckhdq %%mm5,%%mm3\n\t" \
    1.283 +      _filter \
    1.284 +      /*mm0={b0+R_0'',...,b7+R_7''}*/ \
    1.285 +      "movq %%mm1,%%mm0\n\t" \
    1.286 +      /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
    1.287 +      "punpcklbw %%mm2,%%mm1\n\t" \
    1.288 +      /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
    1.289 +      "punpckhbw %%mm2,%%mm0\n\t" \
    1.290 +      /*[d]=c1 b1 c0 b0*/ \
    1.291 +      "movd %%mm1,%[d]\n\t" \
    1.292 +      "movw %w[d],1(%[pix])\n\t" \
    1.293 +      "psrlq $32,%%mm1\n\t" \
    1.294 +      "shr $16,%[d]\n\t" \
    1.295 +      "movw %w[d],1(%[pix],%[ystride])\n\t" \
    1.296 +      /*[d]=c3 b3 c2 b2*/ \
    1.297 +      "movd %%mm1,%[d]\n\t" \
    1.298 +      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
    1.299 +      "shr $16,%[d]\n\t" \
    1.300 +      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
    1.301 +      "lea (%[pix],%[ystride],4),%[pix]\n\t" \
    1.302 +      /*[d]=c5 b5 c4 b4*/ \
    1.303 +      "movd %%mm0,%[d]\n\t" \
    1.304 +      "movw %w[d],1(%[pix])\n\t" \
    1.305 +      "psrlq $32,%%mm0\n\t" \
    1.306 +      "shr $16,%[d]\n\t" \
    1.307 +      "movw %w[d],1(%[pix],%[ystride])\n\t" \
    1.308 +      /*[d]=c7 b7 c6 b6*/ \
    1.309 +      "movd %%mm0,%[d]\n\t" \
    1.310 +      "movw %w[d],1(%[pix],%[ystride],2)\n\t" \
    1.311 +      "shr $16,%[d]\n\t" \
    1.312 +      "movw %w[d],1(%[pix],%[ystride3])\n\t" \
    1.313 +      :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
    1.314 +      :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
    1.315 +      :"memory" \
    1.316 +    ); \
    1.317 +  } \
    1.318 +  while(0)
   1.319 +
   1.320 +# endif
   1.321 +#endif

mercurial