/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
  between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    ptrdiff_t            ystride3; \
    src=(_src); \
    dst=(_dst); \
    __asm__ __volatile__( \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*ystride3=ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[src],%[ystride],4),%[src]\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
      :[ystride]"r"((ptrdiff_t)(_ystride)) \
      :"memory" \
    ); \
  } \
  while(0)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
  between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
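/*[Annotation, not part of the original file: a portable scalar sketch of
  what OC_FRAG_COPY_MMX computes, for reference only.  The MMX version moves
  each 8-byte row with one movq load/store pair and fully unrolls the loop;
  this sketch is kept out of the build with #if 0.]*/
#if 0
static void oc_frag_copy_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    /*Copy one 8-byte row, then step both pointers to the next row.*/
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif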
/*Copies the fragments specified by the lists of fragment indices from one
  frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
  }
}
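/*[Annotation, not part of the original file: the routine below reconstructs
  an intra-coded fragment.  The pcmpeqw/psllw/psrlw sequence materializes the
  constant 0x0080008000800080 (128 in each 16-bit lane) without a memory
  load; paddsw adds that bias to the signed residue and packuswb clamps the
  result to [0,255].  A scalar sketch of the same computation, kept out of
  the build with #if 0:]*/
#if 0
static void oc_frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Bias the signed residue by 128, then clamp as packuswb would.*/
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif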
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    "pcmpeqw %%mm0,%%mm0\n\t"
    /*#0 Load low residue.*/
    "movq 0*8(%[residue]),%%mm1\n\t"
    /*#0 Load high residue.*/
    "movq 1*8(%[residue]),%%mm2\n\t"
    /*Set mm0 to 0x8000800080008000.*/
    "psllw $15,%%mm0\n\t"
    /*#1 Load low residue.*/
    "movq 2*8(%[residue]),%%mm3\n\t"
    /*#1 Load high residue.*/
    "movq 3*8(%[residue]),%%mm4\n\t"
    /*Set mm0 to 0x0080008000800080.*/
    "psrlw $8,%%mm0\n\t"
    /*#2 Load low residue.*/
    "movq 4*8(%[residue]),%%mm5\n\t"
    /*#2 Load high residue.*/
    "movq 5*8(%[residue]),%%mm6\n\t"
    /*#0 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#0 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#0 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#1 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#1 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#1 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#2 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#2 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#2 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#0 Write row.*/
    "movq %%mm1,(%[dst])\n\t"
    /*#1 Write row.*/
    "movq %%mm3,(%[dst],%[ystride])\n\t"
    /*#2 Write row.*/
    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
    /*#3 Load low residue.*/
    "movq 6*8(%[residue]),%%mm1\n\t"
    /*#3 Load high residue.*/
    "movq 7*8(%[residue]),%%mm2\n\t"
    /*#4 Load low residue.*/
    "movq 8*8(%[residue]),%%mm3\n\t"
    /*#4 Load high residue.*/
    "movq 9*8(%[residue]),%%mm4\n\t"
    /*#5 Load low residue.*/
    "movq 10*8(%[residue]),%%mm5\n\t"
    /*#5 Load high residue.*/
    "movq 11*8(%[residue]),%%mm6\n\t"
    /*#3 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#3 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#3 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#4 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#4 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#4 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#5 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#5 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#5 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#3 Write row.*/
    "movq %%mm1,(%[dst],%[ystride3])\n\t"
    /*#4 Write row.*/
    "movq %%mm3,(%[dst4])\n\t"
    /*#5 Write row.*/
    "movq %%mm5,(%[dst4],%[ystride])\n\t"
    /*#6 Load low residue.*/
    "movq 12*8(%[residue]),%%mm1\n\t"
    /*#6 Load high residue.*/
    "movq 13*8(%[residue]),%%mm2\n\t"
    /*#7 Load low residue.*/
    "movq 14*8(%[residue]),%%mm3\n\t"
    /*#7 Load high residue.*/
    "movq 15*8(%[residue]),%%mm4\n\t"
    /*#6 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#6 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#6 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#7 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#7 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#7 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#6 Write row.*/
    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
    /*#7 Write row.*/
    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
    :
    :[residue]"r"(_residue),
     [dst]"r"(_dst),
     [dst4]"r"(_dst+(_ystride<<2)),
     [ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    :"memory"
  );
}

void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load source.*/
      "movq (%[src]),%%mm3\n\t"
      /*#1 Load source.*/
      "movq (%[src],%[ystride]),%%mm7\n\t"
      /*#0 Get copy of src.*/
      "movq %%mm3,%%mm4\n\t"
      /*#0 Expand high source.*/
      "punpckhbw %%mm0,%%mm4\n\t"
      /*#0 Expand low source.*/
      "punpcklbw %%mm0,%%mm3\n\t"
      /*#0 Add residue high.*/
      "paddsw 8(%[residue]),%%mm4\n\t"
      /*#1 Get copy of src.*/
      "movq %%mm7,%%mm2\n\t"
      /*#0 Add residue low.*/
      "paddsw (%[residue]),%%mm3\n\t"
      /*#1 Expand high source.*/
      "punpckhbw %%mm0,%%mm2\n\t"
      /*#0 Pack final row pixels.*/
      "packuswb %%mm4,%%mm3\n\t"
      /*#1 Expand low source.*/
      "punpcklbw %%mm0,%%mm7\n\t"
      /*#1 Add residue low.*/
      "paddsw 16(%[residue]),%%mm7\n\t"
      /*#1 Add residue high.*/
      "paddsw 24(%[residue]),%%mm2\n\t"
      /*Advance residue.*/
      "lea 32(%[residue]),%[residue]\n\t"
      /*#1 Pack final row pixels.*/
      "packuswb %%mm2,%%mm7\n\t"
      /*Advance src.*/
      "lea (%[src],%[ystride],2),%[src]\n\t"
      /*#0 Write row.*/
      "movq %%mm3,(%[dst])\n\t"
      /*#1 Write row.*/
      "movq %%mm7,(%[dst],%[ystride])\n\t"
      /*Advance dst.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
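/*[Annotation, not part of the original file: oc_frag_recon_inter_mmx above
  adds the residue to a single motion-compensated predictor.  Each source
  byte is zero-extended to 16 bits (punpcklbw/punpckhbw against a zeroed
  register), the signed residue is added with paddsw, and packuswb clamps the
  result to [0,255].  A scalar sketch, kept out of the build with #if 0:]*/
#if 0
static void oc_frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Predictor plus residue, clamped as packuswb would clamp it.*/
      p=_src[j]+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif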
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load src1.*/
      "movq (%[src1]),%%mm0\n\t"
      /*#0 Load src2.*/
      "movq (%[src2]),%%mm2\n\t"
      /*#0 Copy src1.*/
      "movq %%mm0,%%mm1\n\t"
      /*#0 Copy src2.*/
      "movq %%mm2,%%mm3\n\t"
      /*#1 Load src1.*/
      "movq (%[src1],%[ystride]),%%mm4\n\t"
      /*#0 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm0\n\t"
      /*#1 Load src2.*/
      "movq (%[src2],%[ystride]),%%mm5\n\t"
      /*#0 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm1\n\t"
      /*#0 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm2\n\t"
      /*#0 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*Advance src1 ptr.*/
      "lea (%[src1],%[ystride],2),%[src1]\n\t"
      /*Advance src2 ptr.*/
      "lea (%[src2],%[ystride],2),%[src2]\n\t"
      /*#0 Lower src1+src2.*/
      "paddsw %%mm2,%%mm0\n\t"
      /*#0 Higher src1+src2.*/
      "paddsw %%mm3,%%mm1\n\t"
      /*#1 Copy src1.*/
      "movq %%mm4,%%mm2\n\t"
      /*#0 Build lo average.*/
      "psraw $1,%%mm0\n\t"
      /*#1 Copy src2.*/
      "movq %%mm5,%%mm3\n\t"
      /*#1 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm4\n\t"
      /*#0 Build hi average.*/
      "psraw $1,%%mm1\n\t"
      /*#1 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm2\n\t"
      /*#0 low+=residue.*/
      "paddsw (%[residue]),%%mm0\n\t"
      /*#1 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm5\n\t"
      /*#0 high+=residue.*/
      "paddsw 8(%[residue]),%%mm1\n\t"
      /*#1 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*#1 Lower src1+src2.*/
      "paddsw %%mm4,%%mm5\n\t"
      /*#0 Pack and saturate.*/
      "packuswb %%mm1,%%mm0\n\t"
      /*#1 Higher src1+src2.*/
      "paddsw %%mm2,%%mm3\n\t"
      /*#0 Write row.*/
      "movq %%mm0,(%[dst])\n\t"
      /*#1 Build lo average.*/
      "psraw $1,%%mm5\n\t"
      /*#1 Build hi average.*/
      "psraw $1,%%mm3\n\t"
      /*#1 low+=residue.*/
      "paddsw 16(%[residue]),%%mm5\n\t"
      /*#1 high+=residue.*/
      "paddsw 24(%[residue]),%%mm3\n\t"
      /*#1 Pack and saturate.*/
      "packuswb %%mm3,%%mm5\n\t"
      /*#1 Write row.*/
      "movq %%mm5,(%[dst],%[ystride])\n\t"
      /*Advance residue ptr.*/
      "add $32,%[residue]\n\t"
      /*Advance dest ptr.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
       [src1]"+%r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}

void oc_restore_fpu_mmx(void){
  __asm__ __volatile__("emms\n\t");
}
#endif
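/*[Annotation, not part of the original file: oc_frag_recon_inter2_mmx above
  averages two predictors before adding the residue.  Since the zero-extended
  16-bit sums are non-negative, psraw $1 computes a truncating average.  A
  scalar sketch, kept out of the build with #if 0.  Note also that any caller
  of these routines must execute emms (via oc_restore_fpu_mmx) before running
  x87 floating-point code, because the MMX registers alias the x87 register
  stack.]*/
#if 0
static void oc_frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Truncating average of the two predictors, then add the residue.*/
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
#endif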