--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libtheora/lib/x86_vc/mmxfrag.c Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,416 @@
+/********************************************************************
+ *                                                                  *
+ * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
+ * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
+ * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
+ * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
+ *                                                                  *
+ * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
+ * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
+ *                                                                  *
+ ********************************************************************
+
+  function:
+    last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $
+
+ ********************************************************************/
+
+/*MMX acceleration of fragment reconstruction for motion compensation.
+  Originally written by Rudolf Marek.
+  Additional optimization by Nils Pipenbrinck.
+  Note: Loops are unrolled for best performance.
+  The iteration each instruction belongs to is marked in the comments as #i.*/
+#include <stddef.h>
+#include "x86int.h"
+
+#if defined(OC_X86_ASM)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+  between rows.*/
+# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
+  do{ \
+    const unsigned char *src; \
+    unsigned char *dst; \
+    src=(_src); \
+    dst=(_dst); \
+    __asm mov SRC,src \
+    __asm mov DST,dst \
+    __asm mov YSTRIDE,_ystride \
+    /*src+0*ystride*/ \
+    __asm movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm movq mm1,[SRC+YSTRIDE] \
+    /*ystride3=ystride*3*/ \
+    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
+    /*src+2*ystride*/ \
+    __asm movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm movq [DST+YSTRIDE],mm1 \
+    /*Pointer to next 4.*/ \
+    __asm lea SRC,[SRC+YSTRIDE*4] \
+    /*dst+2*ystride*/ \
+    __asm movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm movq [DST+YSTRIDE3],mm3 \
+    /*Pointer to next 4.*/ \
+    __asm lea DST,[DST+YSTRIDE*4] \
+    /*src+0*ystride*/ \
+    __asm movq mm0,[SRC] \
+    /*src+1*ystride*/ \
+    __asm movq mm1,[SRC+YSTRIDE] \
+    /*src+2*ystride*/ \
+    __asm movq mm2,[SRC+YSTRIDE*2] \
+    /*src+3*ystride*/ \
+    __asm movq mm3,[SRC+YSTRIDE3] \
+    /*dst+0*ystride*/ \
+    __asm movq [DST],mm0 \
+    /*dst+1*ystride*/ \
+    __asm movq [DST+YSTRIDE],mm1 \
+    /*dst+2*ystride*/ \
+    __asm movq [DST+YSTRIDE*2],mm2 \
+    /*dst+3*ystride*/ \
+    __asm movq [DST+YSTRIDE3],mm3 \
+  } \
+  while(0)
+
+/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
+  between rows.*/
+void oc_frag_copy_mmx(unsigned char *_dst,
+ const unsigned char *_src,int _ystride){
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 esi
+  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+}
+
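For orientation, the following scalar C sketch shows what OC_FRAG_COPY_MMX computes: an 8x8 block copy with _ystride bytes between rows, done here in two unrolled groups of four rows. It is an illustration only, not part of the patch (hence no "+" prefix), and the helper name frag_copy_c_sketch is hypothetical.

/*Illustrative scalar equivalent of OC_FRAG_COPY_MMX (not part of the patch).*/
static void frag_copy_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  /*Copy 8 rows of 8 bytes each, stepping by the row stride.*/
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[i*_ystride+j]=_src[i*_ystride+j];
  }
}
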
+/*Copies the fragments specified by the lists of fragment indices from one
+  frame to another.
+  _dst_frame:     The reference frame to copy to.
+  _src_frame:     The reference frame to copy from.
+  _ystride:       The row stride of the reference frames.
+  _fragis:        A pointer to a list of fragment indices.
+  _nfragis:       The number of fragment indices to copy.
+  _frag_buf_offs: The offsets of fragments in the reference frames.*/
+void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
+ const unsigned char *_src_frame,int _ystride,
+ const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
+  ptrdiff_t fragii;
+  for(fragii=0;fragii<_nfragis;fragii++){
+    ptrdiff_t frag_buf_off;
+    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
+#define SRC edx
+#define DST eax
+#define YSTRIDE ecx
+#define YSTRIDE3 edi
+    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
+     _src_frame+frag_buf_off,_ystride);
+#undef SRC
+#undef DST
+#undef YSTRIDE
+#undef YSTRIDE3
+  }
+}
+
+void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
+ const ogg_int16_t *_residue){
+  __asm{
+#define DST edx
+#define DST4 esi
+#define YSTRIDE eax
+#define YSTRIDE3 edi
+#define RESIDUE ecx
+    mov DST,_dst
+    mov YSTRIDE,_ystride
+    mov RESIDUE,_residue
+    lea DST4,[DST+YSTRIDE*4]
+    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
+    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
+    pcmpeqw mm0,mm0
+    /*#0 Load low residue.*/
+    movq mm1,[0*8+RESIDUE]
+    /*#0 Load high residue.*/
+    movq mm2,[1*8+RESIDUE]
+    /*Set mm0 to 0x8000800080008000.*/
+    psllw mm0,15
+    /*#1 Load low residue.*/
+    movq mm3,[2*8+RESIDUE]
+    /*#1 Load high residue.*/
+    movq mm4,[3*8+RESIDUE]
+    /*Set mm0 to 0x0080008000800080.*/
+    psrlw mm0,8
+    /*#2 Load low residue.*/
+    movq mm5,[4*8+RESIDUE]
+    /*#2 Load high residue.*/
+    movq mm6,[5*8+RESIDUE]
+    /*#0 Bias low residue.*/
+    paddsw mm1,mm0
+    /*#0 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#0 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#1 Bias low residue.*/
+    paddsw mm3,mm0
+    /*#1 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#1 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#2 Bias low residue.*/
+    paddsw mm5,mm0
+    /*#2 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#2 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#0 Write row.*/
+    movq [DST],mm1
+    /*#1 Write row.*/
+    movq [DST+YSTRIDE],mm3
+    /*#2 Write row.*/
+    movq [DST+YSTRIDE*2],mm5
+    /*#3 Load low residue.*/
+    movq mm1,[6*8+RESIDUE]
+    /*#3 Load high residue.*/
+    movq mm2,[7*8+RESIDUE]
+    /*#4 Load low residue.*/
+    movq mm3,[8*8+RESIDUE]
+    /*#4 Load high residue.*/
+    movq mm4,[9*8+RESIDUE]
+    /*#5 Load low residue.*/
+    movq mm5,[10*8+RESIDUE]
+    /*#5 Load high residue.*/
+    movq mm6,[11*8+RESIDUE]
+    /*#3 Bias low residue.*/
+    paddsw mm1,mm0
+    /*#3 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#3 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#4 Bias low residue.*/
+    paddsw mm3,mm0
+    /*#4 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#4 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#5 Bias low residue.*/
+    paddsw mm5,mm0
+    /*#5 Bias high residue.*/
+    paddsw mm6,mm0
+    /*#5 Pack to byte.*/
+    packuswb mm5,mm6
+    /*#3 Write row.*/
+    movq [DST+YSTRIDE3],mm1
+    /*#4 Write row.*/
+    movq [DST4],mm3
+    /*#5 Write row.*/
+    movq [DST4+YSTRIDE],mm5
+    /*#6 Load low residue.*/
+    movq mm1,[12*8+RESIDUE]
+    /*#6 Load high residue.*/
+    movq mm2,[13*8+RESIDUE]
+    /*#7 Load low residue.*/
+    movq mm3,[14*8+RESIDUE]
+    /*#7 Load high residue.*/
+    movq mm4,[15*8+RESIDUE]
+    /*#6 Bias low residue.*/
+    paddsw mm1,mm0
+    /*#6 Bias high residue.*/
+    paddsw mm2,mm0
+    /*#6 Pack to byte.*/
+    packuswb mm1,mm2
+    /*#7 Bias low residue.*/
+    paddsw mm3,mm0
+    /*#7 Bias high residue.*/
+    paddsw mm4,mm0
+    /*#7 Pack to byte.*/
+    packuswb mm3,mm4
+    /*#6 Write row.*/
+    movq [DST4+YSTRIDE*2],mm1
+    /*#7 Write row.*/
+    movq [DST4+YSTRIDE3],mm3
+#undef DST
+#undef DST4
+#undef YSTRIDE
+#undef YSTRIDE3
+#undef RESIDUE
+  }
+}
+
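The bias trick in oc_frag_recon_intra_mmx builds the constant 0x0080008000800080 in mm0, adds it to each signed 16-bit residue word with paddsw, and lets packuswb clamp the result to the 0..255 byte range; the net effect per pixel is residue plus 128, saturated. A scalar sketch of that effect follows (illustration only, not part of the patch, hence no "+" prefix; the helper name is hypothetical).

/*Illustrative scalar equivalent of oc_frag_recon_intra_mmx (not part of the
  patch): each reconstructed pixel is the residue plus 128, clamped to
  0..255.*/
static void frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_residue[i*8+j]+128;
      /*Clamp to the byte range, as packuswb does.*/
      _dst[i*_ystride+j]=(unsigned char)(p<0?0:p>255?255:p);
    }
  }
}
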
+void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
+ int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*Zero mm0.*/
+  __asm pxor mm0,mm0;
+  for(i=4;i-->0;){
+    __asm{
+#define DST edx
+#define SRC ecx
+#define YSTRIDE edi
+#define RESIDUE eax
+      mov DST,_dst
+      mov SRC,_src
+      mov YSTRIDE,_ystride
+      mov RESIDUE,_residue
+      /*#0 Load source.*/
+      movq mm3,[SRC]
+      /*#1 Load source.*/
+      movq mm7,[SRC+YSTRIDE]
+      /*#0 Get copy of src.*/
+      movq mm4,mm3
+      /*#0 Expand high source.*/
+      punpckhbw mm4,mm0
+      /*#0 Expand low source.*/
+      punpcklbw mm3,mm0
+      /*#0 Add residue high.*/
+      paddsw mm4,[8+RESIDUE]
+      /*#1 Get copy of src.*/
+      movq mm2,mm7
+      /*#0 Add residue low.*/
+      paddsw mm3,[RESIDUE]
+      /*#1 Expand high source.*/
+      punpckhbw mm2,mm0
+      /*#0 Pack final row pixels.*/
+      packuswb mm3,mm4
+      /*#1 Expand low source.*/
+      punpcklbw mm7,mm0
+      /*#1 Add residue low.*/
+      paddsw mm7,[16+RESIDUE]
+      /*#1 Add residue high.*/
+      paddsw mm2,[24+RESIDUE]
+      /*Advance residue.*/
+      lea RESIDUE,[32+RESIDUE]
+      /*#1 Pack final row pixels.*/
+      packuswb mm7,mm2
+      /*Advance src.*/
+      lea SRC,[SRC+YSTRIDE*2]
+      /*#0 Write row.*/
+      movq [DST],mm3
+      /*#1 Write row.*/
+      movq [DST+YSTRIDE],mm7
+      /*Advance dst.*/
+      lea DST,[DST+YSTRIDE*2]
+      mov _residue,RESIDUE
+      mov _dst,DST
+      mov _src,SRC
+#undef DST
+#undef SRC
+#undef YSTRIDE
+#undef RESIDUE
+    }
+  }
+}
+
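In scalar terms, each output pixel of oc_frag_recon_inter_mmx is the predictor byte widened to 16 bits, plus the signed residue, clamped back to 0..255 by packuswb; the loop handles two rows per iteration. A sketch of that computation follows (illustration only, not part of the patch, hence no "+" prefix; the helper name is hypothetical).

/*Illustrative scalar equivalent of oc_frag_recon_inter_mmx (not part of the
  patch): predictor plus residue, clamped to 0..255.*/
static void frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_src[i*_ystride+j]+_residue[i*8+j];
      _dst[i*_ystride+j]=(unsigned char)(p<0?0:p>255?255:p);
    }
  }
}
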
+void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
+ const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
+  int i;
+  /*Zero mm7.*/
+  __asm pxor mm7,mm7;
+  for(i=4;i-->0;){
+    __asm{
+#define SRC1 ecx
+#define SRC2 edi
+#define YSTRIDE esi
+#define RESIDUE edx
+#define DST eax
+      mov YSTRIDE,_ystride
+      mov DST,_dst
+      mov RESIDUE,_residue
+      mov SRC1,_src1
+      mov SRC2,_src2
+      /*#0 Load src1.*/
+      movq mm0,[SRC1]
+      /*#0 Load src2.*/
+      movq mm2,[SRC2]
+      /*#0 Copy src1.*/
+      movq mm1,mm0
+      /*#0 Copy src2.*/
+      movq mm3,mm2
+      /*#1 Load src1.*/
+      movq mm4,[SRC1+YSTRIDE]
+      /*#0 Unpack lower src1.*/
+      punpcklbw mm0,mm7
+      /*#1 Load src2.*/
+      movq mm5,[SRC2+YSTRIDE]
+      /*#0 Unpack higher src1.*/
+      punpckhbw mm1,mm7
+      /*#0 Unpack lower src2.*/
+      punpcklbw mm2,mm7
+      /*#0 Unpack higher src2.*/
+      punpckhbw mm3,mm7
+      /*Advance src1 ptr.*/
+      lea SRC1,[SRC1+YSTRIDE*2]
+      /*Advance src2 ptr.*/
+      lea SRC2,[SRC2+YSTRIDE*2]
+      /*#0 Lower src1+src2.*/
+      paddsw mm0,mm2
+      /*#0 Higher src1+src2.*/
+      paddsw mm1,mm3
+      /*#1 Copy src1.*/
+      movq mm2,mm4
+      /*#0 Build lo average.*/
+      psraw mm0,1
+      /*#1 Copy src2.*/
+      movq mm3,mm5
+      /*#1 Unpack lower src1.*/
+      punpcklbw mm4,mm7
+      /*#0 Build hi average.*/
+      psraw mm1,1
+      /*#1 Unpack higher src1.*/
+      punpckhbw mm2,mm7
+      /*#0 low+=residue.*/
+      paddsw mm0,[RESIDUE]
+      /*#1 Unpack lower src2.*/
+      punpcklbw mm5,mm7
+      /*#0 high+=residue.*/
+      paddsw mm1,[8+RESIDUE]
+      /*#1 Unpack higher src2.*/
+      punpckhbw mm3,mm7
+      /*#1 Lower src1+src2.*/
+      paddsw mm5,mm4
+      /*#0 Pack and saturate.*/
+      packuswb mm0,mm1
+      /*#1 Higher src1+src2.*/
+      paddsw mm3,mm2
+      /*#0 Write row.*/
+      movq [DST],mm0
+      /*#1 Build lo average.*/
+      psraw mm5,1
+      /*#1 Build hi average.*/
+      psraw mm3,1
+      /*#1 low+=residue.*/
+      paddsw mm5,[16+RESIDUE]
+      /*#1 high+=residue.*/
+      paddsw mm3,[24+RESIDUE]
+      /*#1 Pack and saturate.*/
+      packuswb mm5,mm3
+      /*#1 Write row.*/
+      movq [DST+YSTRIDE],mm5
+      /*Advance residue ptr.*/
+      add RESIDUE,32
+      /*Advance dest ptr.*/
+      lea DST,[DST+YSTRIDE*2]
+      mov _dst,DST
+      mov _residue,RESIDUE
+      mov _src1,SRC1
+      mov _src2,SRC2
+#undef SRC1
+#undef SRC2
+#undef YSTRIDE
+#undef RESIDUE
+#undef DST
+    }
+  }
+}
+
+void oc_restore_fpu_mmx(void){
+  __asm emms;
+}
+
+#endif
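For completeness, a scalar sketch of what oc_frag_recon_inter2_mmx above computes: the two predictors are averaged with truncation (the paddsw/psraw pair), the signed residue is added, and packuswb clamps the result to 0..255. Illustration only, not part of the patch (hence no "+" prefix); the helper name is hypothetical.

/*Illustrative scalar equivalent of oc_frag_recon_inter2_mmx (not part of the
  patch).*/
static void frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Truncating average of the two predictors, then add the residue.*/
      p=((_src1[i*_ystride+j]+_src2[i*_ystride+j])>>1)+_residue[i*8+j];
      _dst[i*_ystride+j]=(unsigned char)(p<0?0:p>255?255:p);
    }
  }
}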