1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libtheora/lib/x86/mmxloop.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,318 @@ 1.4 +#if !defined(_x86_mmxloop_H) 1.5 +# define _x86_mmxloop_H (1) 1.6 +# include <stddef.h> 1.7 +# include "x86int.h" 1.8 + 1.9 +#if defined(OC_X86_ASM) 1.10 + 1.11 +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. 1.12 + On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and 1.13 + mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ 1.14 +#define OC_LOOP_FILTER8_MMX \ 1.15 + "#OC_LOOP_FILTER8_MMX\n\t" \ 1.16 + /*mm7=0*/ \ 1.17 + "pxor %%mm7,%%mm7\n\t" \ 1.18 + /*mm6:mm0={a0,...,a7}*/ \ 1.19 + "movq %%mm0,%%mm6\n\t" \ 1.20 + "punpcklbw %%mm7,%%mm0\n\t" \ 1.21 + "punpckhbw %%mm7,%%mm6\n\t" \ 1.22 + /*mm3:mm5={d0,...,d7}*/ \ 1.23 + "movq %%mm3,%%mm5\n\t" \ 1.24 + "punpcklbw %%mm7,%%mm3\n\t" \ 1.25 + "punpckhbw %%mm7,%%mm5\n\t" \ 1.26 + /*mm6:mm0={a0-d0,...,a7-d7}*/ \ 1.27 + "psubw %%mm3,%%mm0\n\t" \ 1.28 + "psubw %%mm5,%%mm6\n\t" \ 1.29 + /*mm3:mm1={b0,...,b7}*/ \ 1.30 + "movq %%mm1,%%mm3\n\t" \ 1.31 + "punpcklbw %%mm7,%%mm1\n\t" \ 1.32 + "movq %%mm2,%%mm4\n\t" \ 1.33 + "punpckhbw %%mm7,%%mm3\n\t" \ 1.34 + /*mm5:mm4={c0,...,c7}*/ \ 1.35 + "movq %%mm2,%%mm5\n\t" \ 1.36 + "punpcklbw %%mm7,%%mm4\n\t" \ 1.37 + "punpckhbw %%mm7,%%mm5\n\t" \ 1.38 + /*mm7={3}x4 \ 1.39 + mm5:mm4={c0-b0,...,c7-b7}*/ \ 1.40 + "pcmpeqw %%mm7,%%mm7\n\t" \ 1.41 + "psubw %%mm1,%%mm4\n\t" \ 1.42 + "psrlw $14,%%mm7\n\t" \ 1.43 + "psubw %%mm3,%%mm5\n\t" \ 1.44 + /*Scale by 3.*/ \ 1.45 + "pmullw %%mm7,%%mm4\n\t" \ 1.46 + "pmullw %%mm7,%%mm5\n\t" \ 1.47 + /*mm7={4}x4 \ 1.48 + mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ 1.49 + "psrlw $1,%%mm7\n\t" \ 1.50 + "paddw %%mm0,%%mm4\n\t" \ 1.51 + "psllw $2,%%mm7\n\t" \ 1.52 + "movq (%[ll]),%%mm0\n\t" \ 1.53 + "paddw %%mm6,%%mm5\n\t" \ 1.54 + /*R_i has the range [-127,128], so we compute -R_i instead. 
\ 1.55 + mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ 1.56 + "psubw %%mm7,%%mm4\n\t" \ 1.57 + "psubw %%mm7,%%mm5\n\t" \ 1.58 + "psraw $3,%%mm4\n\t" \ 1.59 + "psraw $3,%%mm5\n\t" \ 1.60 + "pcmpeqb %%mm7,%%mm7\n\t" \ 1.61 + "packsswb %%mm5,%%mm4\n\t" \ 1.62 + "pxor %%mm6,%%mm6\n\t" \ 1.63 + "pxor %%mm7,%%mm4\n\t" \ 1.64 + "packuswb %%mm3,%%mm1\n\t" \ 1.65 + /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \ 1.66 + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ 1.67 + we have to split things by sign (the other option is to work in 16 bits, \ 1.68 + but working in 8 bits gives much better parallelism). \ 1.69 + We compute abs(R_i), but save a mask of which terms were negative in mm6. \ 1.70 + Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \ 1.71 + Finally, we split mm4 into positive and negative pieces using the mask in \ 1.72 + mm6, and add and subtract them as appropriate.*/ \ 1.73 + /*mm4=abs(-R_i)*/ \ 1.74 + /*mm7=255-2*L*/ \ 1.75 + "pcmpgtb %%mm4,%%mm6\n\t" \ 1.76 + "psubb %%mm0,%%mm7\n\t" \ 1.77 + "pxor %%mm6,%%mm4\n\t" \ 1.78 + "psubb %%mm0,%%mm7\n\t" \ 1.79 + "psubb %%mm6,%%mm4\n\t" \ 1.80 + /*mm7=255-max(2*L-abs(R_i),0)*/ \ 1.81 + "paddusb %%mm4,%%mm7\n\t" \ 1.82 + /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ 1.83 + "paddusb %%mm7,%%mm4\n\t" \ 1.84 + "psubusb %%mm7,%%mm4\n\t" \ 1.85 + /*Now split mm4 by the original sign of -R_i.*/ \ 1.86 + "movq %%mm4,%%mm5\n\t" \ 1.87 + "pand %%mm6,%%mm4\n\t" \ 1.88 + "pandn %%mm5,%%mm6\n\t" \ 1.89 + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ 1.90 + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ 1.91 + "paddusb %%mm4,%%mm1\n\t" \ 1.92 + "psubusb %%mm4,%%mm2\n\t" \ 1.93 + "psubusb %%mm6,%%mm1\n\t" \ 1.94 + "paddusb %%mm6,%%mm2\n\t" \ 1.95 + 1.96 +/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. 1.97 + On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and 1.98 + mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}. 
1.99 + All other MMX registers are clobbered.*/ 1.100 +#define OC_LOOP_FILTER8_MMXEXT \ 1.101 + "#OC_LOOP_FILTER8_MMXEXT\n\t" \ 1.102 + /*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \ 1.103 + -R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \ 1.104 + /*This first part is based on the transformation \ 1.105 + f = -(3*(c-b)+a-d+4>>3) \ 1.106 + = -(3*(c+255-b)+(a+255-d)+4-1020>>3) \ 1.107 + = -(3*(c+~b)+(a+~d)-1016>>3) \ 1.108 + = 127-(3*(c+~b)+(a+~d)>>3) \ 1.109 + = 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \ 1.110 + Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \ 1.111 + fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \ 1.112 + Using this, the last expression above can be computed in 8 bits of working \ 1.113 + precision via: \ 1.114 + u = ~pavgb(~b,c); \ 1.115 + v = pavgb(b,~c); \ 1.116 + This mask is 0 or 0xFF, and controls whether t is biased up or down: \ 1.117 + m = u-v; \ 1.118 + t = m^pavgb(m^~a,m^d); \ 1.119 + f = 128+pavgb(pavgb(t,u),v); \ 1.120 + This required some careful analysis to ensure that carries are propagated \ 1.121 + correctly in all cases, but has been checked exhaustively.*/ \ 1.122 + /*input (a, b, c, d, ., ., ., .)*/ \ 1.123 + /*ff=0xFF; \ 1.124 + u=b; \ 1.125 + v=c; \ 1.126 + ll=255-2*L;*/ \ 1.127 + "pcmpeqb %%mm7,%%mm7\n\t" \ 1.128 + "movq %%mm1,%%mm4\n\t" \ 1.129 + "movq %%mm2,%%mm5\n\t" \ 1.130 + "movq (%[ll]),%%mm6\n\t" \ 1.131 + /*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \ 1.132 + /*u^=ff; \ 1.133 + v^=ff;*/ \ 1.134 + "pxor %%mm7,%%mm4\n\t" \ 1.135 + "pxor %%mm7,%%mm5\n\t" \ 1.136 + /*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \ 1.137 + /*u=pavgb(u,c); \ 1.138 + v=pavgb(v,b);*/ \ 1.139 + "pavgb %%mm2,%%mm4\n\t" \ 1.140 + "pavgb %%mm1,%%mm5\n\t" \ 1.141 + /*u^=ff; \ 1.142 + a^=ff;*/ \ 1.143 + "pxor %%mm7,%%mm4\n\t" \ 1.144 + "pxor %%mm7,%%mm0\n\t" \ 1.145 + /*m=u-v;*/ \ 1.146 + "psubb %%mm5,%%mm4\n\t" \ 1.147 + /*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \ 1.148 + /*a^=m; \ 
1.149 + d^=m;*/ \ 1.150 + "pxor %%mm4,%%mm0\n\t" \ 1.151 + "pxor %%mm4,%%mm3\n\t" \ 1.152 + /*t=pavgb(a,d);*/ \ 1.153 + "pavgb %%mm3,%%mm0\n\t" \ 1.154 + "psllw $7,%%mm7\n\t" \ 1.155 + /*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \ 1.156 + /*t^=m; \ 1.157 + u=m+v;*/ \ 1.158 + "pxor %%mm4,%%mm0\n\t" \ 1.159 + "paddb %%mm5,%%mm4\n\t" \ 1.160 + /*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \ 1.161 + /*f=pavgb(f,u); \ 1.162 + of=128;*/ \ 1.163 + "pavgb %%mm4,%%mm0\n\t" \ 1.164 + "packsswb %%mm7,%%mm7\n\t" \ 1.165 + /*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \ 1.166 + /*f=pavgb(f,v);*/ \ 1.167 + "pavgb %%mm5,%%mm0\n\t" \ 1.168 + "movq %%mm7,%%mm3\n\t" \ 1.169 + "movq %%mm6,%%mm4\n\t" \ 1.170 + /*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \ 1.171 + /*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \ 1.172 + /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ 1.173 + we have to split things by sign (the other option is to work in 16 bits, \ 1.174 + but staying in 8 bits gives much better parallelism).*/ \ 1.175 + /*Instead of adding the offset of 128 in mm3, we use it to split mm0. 
\ 1.176 + This is the same number of instructions as computing a mask and splitting \ 1.177 + after the lflim computation, but has shorter dependency chains.*/ \ 1.178 + /*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\ 1.179 + mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \ 1.180 + "psubusb %%mm0,%%mm3\n\t" \ 1.181 + "psubusb %%mm7,%%mm0\n\t" \ 1.182 + /*mm6=255-max(2*L-abs(R_i<0),0) \ 1.183 + mm4=255-max(2*L-abs(R_i>0),0)*/ \ 1.184 + "paddusb %%mm3,%%mm4\n\t" \ 1.185 + "paddusb %%mm0,%%mm6\n\t" \ 1.186 + /*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \ 1.187 + mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \ 1.188 + "paddusb %%mm4,%%mm3\n\t" \ 1.189 + "paddusb %%mm6,%%mm0\n\t" \ 1.190 + "psubusb %%mm4,%%mm3\n\t" \ 1.191 + "psubusb %%mm6,%%mm0\n\t" \ 1.192 + /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ 1.193 + /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ 1.194 + "paddusb %%mm3,%%mm1\n\t" \ 1.195 + "psubusb %%mm3,%%mm2\n\t" \ 1.196 + "psubusb %%mm0,%%mm1\n\t" \ 1.197 + "paddusb %%mm0,%%mm2\n\t" \ 1.198 + 1.199 +#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \ 1.200 + do{ \ 1.201 + ptrdiff_t ystride3__; \ 1.202 + __asm__ __volatile__( \ 1.203 + /*mm0={a0,...,a7}*/ \ 1.204 + "movq (%[pix]),%%mm0\n\t" \ 1.205 + /*ystride3=_ystride*3*/ \ 1.206 + "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ 1.207 + /*mm3={d0,...,d7}*/ \ 1.208 + "movq (%[pix],%[ystride3]),%%mm3\n\t" \ 1.209 + /*mm1={b0,...,b7}*/ \ 1.210 + "movq (%[pix],%[ystride]),%%mm1\n\t" \ 1.211 + /*mm2={c0,...,c7}*/ \ 1.212 + "movq (%[pix],%[ystride],2),%%mm2\n\t" \ 1.213 + _filter \ 1.214 + /*Write it back out.*/ \ 1.215 + "movq %%mm1,(%[pix],%[ystride])\n\t" \ 1.216 + "movq %%mm2,(%[pix],%[ystride],2)\n\t" \ 1.217 + :[ystride3]"=&r"(ystride3__) \ 1.218 + :[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \ 1.219 + [ll]"r"(_ll) \ 1.220 + :"memory" \ 1.221 + ); \ 1.222 + } \ 1.223 + while(0) 1.224 + 1.225 +#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \ 1.226 + do{ \ 1.227 + unsigned char *pix__; \ 
1.228 + ptrdiff_t ystride3__; \ 1.229 + ptrdiff_t d__; \ 1.230 + pix__=(_pix)-2; \ 1.231 + __asm__ __volatile__( \ 1.232 + /*x x x x d0 c0 b0 a0*/ \ 1.233 + "movd (%[pix]),%%mm0\n\t" \ 1.234 + /*x x x x d1 c1 b1 a1*/ \ 1.235 + "movd (%[pix],%[ystride]),%%mm1\n\t" \ 1.236 + /*ystride3=_ystride*3*/ \ 1.237 + "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \ 1.238 + /*x x x x d2 c2 b2 a2*/ \ 1.239 + "movd (%[pix],%[ystride],2),%%mm2\n\t" \ 1.240 + /*x x x x d3 c3 b3 a3*/ \ 1.241 + "lea (%[pix],%[ystride],4),%[d]\n\t" \ 1.242 + "movd (%[pix],%[ystride3]),%%mm3\n\t" \ 1.243 + /*x x x x d4 c4 b4 a4*/ \ 1.244 + "movd (%[d]),%%mm4\n\t" \ 1.245 + /*x x x x d5 c5 b5 a5*/ \ 1.246 + "movd (%[d],%[ystride]),%%mm5\n\t" \ 1.247 + /*x x x x d6 c6 b6 a6*/ \ 1.248 + "movd (%[d],%[ystride],2),%%mm6\n\t" \ 1.249 + /*x x x x d7 c7 b7 a7*/ \ 1.250 + "movd (%[d],%[ystride3]),%%mm7\n\t" \ 1.251 + /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ 1.252 + "punpcklbw %%mm1,%%mm0\n\t" \ 1.253 + /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ 1.254 + "punpcklbw %%mm3,%%mm2\n\t" \ 1.255 + /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ 1.256 + "movq %%mm0,%%mm3\n\t" \ 1.257 + /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ 1.258 + "punpcklwd %%mm2,%%mm0\n\t" \ 1.259 + /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ 1.260 + "punpckhwd %%mm2,%%mm3\n\t" \ 1.261 + /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ 1.262 + "movq %%mm0,%%mm1\n\t" \ 1.263 + /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ 1.264 + "punpcklbw %%mm5,%%mm4\n\t" \ 1.265 + /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ 1.266 + "punpcklbw %%mm7,%%mm6\n\t" \ 1.267 + /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ 1.268 + "movq %%mm4,%%mm5\n\t" \ 1.269 + /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ 1.270 + "punpcklwd %%mm6,%%mm4\n\t" \ 1.271 + /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ 1.272 + "punpckhwd %%mm6,%%mm5\n\t" \ 1.273 + /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ 1.274 + "movq %%mm3,%%mm2\n\t" \ 1.275 + /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ 1.276 + "punpckldq %%mm4,%%mm0\n\t" \ 1.277 + /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ 1.278 + "punpckhdq %%mm4,%%mm1\n\t" \ 1.279 + /*mm2=c7 c6 
c5 c4 c3 c2 c1 c0*/ \ 1.280 + "punpckldq %%mm5,%%mm2\n\t" \ 1.281 + /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ 1.282 + "punpckhdq %%mm5,%%mm3\n\t" \ 1.283 + _filter \ 1.284 + /*mm2={b0+R_0'',...,b7+R_7''}*/ \ 1.285 + "movq %%mm1,%%mm0\n\t" \ 1.286 + /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ 1.287 + "punpcklbw %%mm2,%%mm1\n\t" \ 1.288 + /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ 1.289 + "punpckhbw %%mm2,%%mm0\n\t" \ 1.290 + /*[d]=c1 b1 c0 b0 (pix__=(_pix)-2, so each b/c byte pair is stored at offset 1)*/ \ 1.291 + "movd %%mm1,%[d]\n\t" \ 1.292 + "movw %w[d],1(%[pix])\n\t" \ 1.293 + "psrlq $32,%%mm1\n\t" \ 1.294 + "shr $16,%[d]\n\t" \ 1.295 + "movw %w[d],1(%[pix],%[ystride])\n\t" \ 1.296 + /*[d]=c3 b3 c2 b2*/ \ 1.297 + "movd %%mm1,%[d]\n\t" \ 1.298 + "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ 1.299 + "shr $16,%[d]\n\t" \ 1.300 + "movw %w[d],1(%[pix],%[ystride3])\n\t" \ 1.301 + "lea (%[pix],%[ystride],4),%[pix]\n\t" \ 1.302 + /*[d]=c5 b5 c4 b4*/ \ 1.303 + "movd %%mm0,%[d]\n\t" \ 1.304 + "movw %w[d],1(%[pix])\n\t" \ 1.305 + "psrlq $32,%%mm0\n\t" \ 1.306 + "shr $16,%[d]\n\t" \ 1.307 + "movw %w[d],1(%[pix],%[ystride])\n\t" \ 1.308 + /*[d]=c7 b7 c6 b6*/ \ 1.309 + "movd %%mm0,%[d]\n\t" \ 1.310 + "movw %w[d],1(%[pix],%[ystride],2)\n\t" \ 1.311 + "shr $16,%[d]\n\t" \ 1.312 + "movw %w[d],1(%[pix],%[ystride3])\n\t" \ 1.313 + :[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \ 1.314 + :[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \ 1.315 + :"memory" \ 1.316 + ); \ 1.317 + } \ 1.318 + while(0) 1.319 + 1.320 +# endif 1.321 +#endif