media/libtheora/lib/x86_vc/mmxfrag.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libtheora/lib/x86_vc/mmxfrag.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,416 @@
     1.4 +/********************************************************************
     1.5 + *                                                                  *
     1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
    1.10 + *                                                                  *
    1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
    1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    1.13 + *                                                                  *
    1.14 + ********************************************************************
    1.15 +
    1.16 +  function:
    1.17 +    last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $
    1.18 +
    1.19 + ********************************************************************/
    1.20 +
    1.21 +/*MMX acceleration of fragment reconstruction for motion compensation.
    1.22 +  Originally written by Rudolf Marek.
    1.23 +  Additional optimization by Nils Pipenbrinck.
    1.24 +  Note: Loops are unrolled for best performance.
    1.25 +  The iteration each instruction belongs to is marked in the comments as #i.*/
    1.26 +#include <stddef.h>
    1.27 +#include "x86int.h"
    1.28 +
    1.29 +#if defined(OC_X86_ASM)
    1.30 +
    1.31 +/*Copies an 8x8 block of pixels (8 rows, one 8-byte movq each) from _src to _dst, assuming _ystride bytes between rows.
    1.32 +  The expansion site must #define SRC, DST, YSTRIDE and YSTRIDE3 as distinct general-purpose registers; the asm overwrites all four plus mm0-mm3, and uses _ystride directly as a mov source operand.*/
    1.33 +# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
    1.34 +  do{ \
    1.35 +    const unsigned char *src; \
    1.36 +    unsigned char       *dst; \
    1.37 +    src=(_src); \
    1.38 +    dst=(_dst); \
    1.39 +    __asm  mov SRC,src \
    1.40 +    __asm  mov DST,dst \
    1.41 +    __asm  mov YSTRIDE,_ystride \
    1.42 +    /*src+0*ystride*/ \
    1.43 +    __asm  movq mm0,[SRC] \
    1.44 +    /*src+1*ystride*/ \
    1.45 +    __asm  movq mm1,[SRC+YSTRIDE] \
    1.46 +    /*ystride3=ystride*3*/ \
    1.47 +    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    1.48 +    /*src+2*ystride*/ \
    1.49 +    __asm  movq mm2,[SRC+YSTRIDE*2] \
    1.50 +    /*src+3*ystride*/ \
    1.51 +    __asm  movq mm3,[SRC+YSTRIDE3] \
    1.52 +    /*dst+0*ystride*/ \
    1.53 +    __asm  movq [DST],mm0 \
    1.54 +    /*dst+1*ystride*/ \
    1.55 +    __asm  movq [DST+YSTRIDE],mm1 \
    1.56 +    /*Advance SRC past the 4 rows just loaded.*/ \
    1.57 +    __asm  lea SRC,[SRC+YSTRIDE*4] \
    1.58 +    /*dst+2*ystride*/ \
    1.59 +    __asm  movq [DST+YSTRIDE*2],mm2 \
    1.60 +    /*dst+3*ystride*/ \
    1.61 +    __asm  movq [DST+YSTRIDE3],mm3 \
    1.62 +    /*Advance DST past the 4 rows just stored.*/ \
    1.63 +    __asm  lea DST,[DST+YSTRIDE*4] \
    1.64 +    /*src+0*ystride*/ \
    1.65 +    __asm  movq mm0,[SRC] \
    1.66 +    /*src+1*ystride*/ \
    1.67 +    __asm  movq mm1,[SRC+YSTRIDE] \
    1.68 +    /*src+2*ystride*/ \
    1.69 +    __asm  movq mm2,[SRC+YSTRIDE*2] \
    1.70 +    /*src+3*ystride*/ \
    1.71 +    __asm  movq mm3,[SRC+YSTRIDE3] \
    1.72 +    /*dst+0*ystride*/ \
    1.73 +    __asm  movq [DST],mm0 \
    1.74 +    /*dst+1*ystride*/ \
    1.75 +    __asm  movq [DST+YSTRIDE],mm1 \
    1.76 +    /*dst+2*ystride*/ \
    1.77 +    __asm  movq [DST+YSTRIDE*2],mm2 \
    1.78 +    /*dst+3*ystride*/ \
    1.79 +    __asm  movq [DST+YSTRIDE3],mm3 \
    1.80 +  } \
    1.81 +  while(0)
    1.82 +
    1.83 +/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes between rows.
    1.84 +  Binds the OC_FRAG_COPY_MMX register names to edx, eax, ecx and esi, then expands the macro once.*/
    1.85 +void oc_frag_copy_mmx(unsigned char *_dst,
    1.86 + const unsigned char *_src,int _ystride){
    1.87 +#define SRC edx
    1.88 +#define DST eax
    1.89 +#define YSTRIDE ecx
    1.90 +#define YSTRIDE3 esi
    1.91 +  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
    1.92 +#undef SRC
    1.93 +#undef DST
    1.94 +#undef YSTRIDE
    1.95 +#undef YSTRIDE3
    1.96 +}
    1.97 +
    1.98 +/*Copies the fragments specified by a list of fragment indices from one
    1.99 +   frame to another, one 8x8 block per index via OC_FRAG_COPY_MMX.
   1.100 +  _dst_frame:     The reference frame to copy to.
   1.101 +  _src_frame:     The reference frame to copy from.
   1.102 +  _ystride:       The row stride of the reference frames.
   1.103 +  _fragis:        A pointer to a list of fragment indices.
   1.104 +  _nfragis:       The number of fragment indices to copy.
   1.105 +  _frag_buf_offs: The offset of each fragment, looked up by fragment index; the same offset is applied to both frames.*/
   1.106 +void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
   1.107 + const unsigned char *_src_frame,int _ystride,
   1.108 + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
   1.109 +  ptrdiff_t fragii;
   1.110 +  for(fragii=0;fragii<_nfragis;fragii++){
   1.111 +    ptrdiff_t frag_buf_off;
   1.112 +    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
   1.113 +#define SRC edx
   1.114 +#define DST eax
   1.115 +#define YSTRIDE ecx
   1.116 +#define YSTRIDE3 edi
   1.117 +    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
   1.118 +     _src_frame+frag_buf_off,_ystride);
   1.119 +#undef SRC
   1.120 +#undef DST
   1.121 +#undef YSTRIDE
   1.122 +#undef YSTRIDE3
   1.123 +  }
   1.124 +}
   1.125 +
   1.126 +void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
   1.127 + const ogg_int16_t *_residue){/*Stores 8 rows of 8 bytes at _dst (rows _ystride bytes apart); each byte is the matching 16-bit _residue value plus 128, saturated to 0..255. Reads 64 consecutive values from _residue.*/
   1.128 +  __asm{
   1.129 +#define DST edx
   1.130 +#define DST4 esi
   1.131 +#define YSTRIDE eax
   1.132 +#define YSTRIDE3 edi
   1.133 +#define RESIDUE ecx
   1.134 +    mov DST,_dst
   1.135 +    mov YSTRIDE,_ystride
   1.136 +    mov RESIDUE,_residue
   1.137 +    lea DST4,[DST+YSTRIDE*4]
   1.138 +    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
   1.139 +    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
   1.140 +    pcmpeqw mm0,mm0
   1.141 +    /*#0 Load low residue.*/
   1.142 +    movq mm1,[0*8+RESIDUE]
   1.143 +    /*#0 Load high residue.*/
   1.144 +    movq mm2,[1*8+RESIDUE]
   1.145 +    /*Set mm0 to 0x8000800080008000.*/
   1.146 +    psllw mm0,15
   1.147 +    /*#1 Load low residue.*/
   1.148 +    movq mm3,[2*8+RESIDUE]
   1.149 +    /*#1 Load high residue.*/
   1.150 +    movq mm4,[3*8+RESIDUE]
   1.151 +    /*Set mm0 to 0x0080008000800080 (the +128 bias for each 16-bit lane).*/
   1.152 +    psrlw mm0,8
   1.153 +    /*#2 Load low residue.*/
   1.154 +    movq mm5,[4*8+RESIDUE]
   1.155 +    /*#2 Load high residue.*/
   1.156 +    movq mm6,[5*8+RESIDUE]
   1.157 +    /*#0 Bias low  residue.*/
   1.158 +    paddsw mm1,mm0
   1.159 +    /*#0 Bias high residue.*/
   1.160 +    paddsw mm2,mm0
   1.161 +    /*#0 Pack to byte.*/
   1.162 +    packuswb mm1,mm2
   1.163 +    /*#1 Bias low  residue.*/
   1.164 +    paddsw mm3,mm0
   1.165 +    /*#1 Bias high residue.*/
   1.166 +    paddsw mm4,mm0
   1.167 +    /*#1 Pack to byte.*/
   1.168 +    packuswb mm3,mm4
   1.169 +    /*#2 Bias low  residue.*/
   1.170 +    paddsw mm5,mm0
   1.171 +    /*#2 Bias high residue.*/
   1.172 +    paddsw mm6,mm0
   1.173 +    /*#2 Pack to byte.*/
   1.174 +    packuswb mm5,mm6
   1.175 +    /*#0 Write row.*/
   1.176 +    movq [DST],mm1
   1.177 +    /*#1 Write row.*/
   1.178 +    movq [DST+YSTRIDE],mm3
   1.179 +    /*#2 Write row.*/
   1.180 +    movq [DST+YSTRIDE*2],mm5
   1.181 +    /*#3 Load low residue.*/
   1.182 +    movq mm1,[6*8+RESIDUE]
   1.183 +    /*#3 Load high residue.*/
   1.184 +    movq mm2,[7*8+RESIDUE]
   1.185 +    /*#4 Load low residue.*/
   1.186 +    movq mm3,[8*8+RESIDUE]
   1.187 +    /*#4 Load high residue.*/
   1.188 +    movq mm4,[9*8+RESIDUE]
   1.189 +    /*#5 Load low residue.*/
   1.190 +    movq mm5,[10*8+RESIDUE]
   1.191 +    /*#5 Load high residue.*/
   1.192 +    movq mm6,[11*8+RESIDUE]
   1.193 +    /*#3 Bias low  residue.*/
   1.194 +    paddsw mm1,mm0
   1.195 +    /*#3 Bias high residue.*/
   1.196 +    paddsw mm2,mm0
   1.197 +    /*#3 Pack to byte.*/
   1.198 +    packuswb mm1,mm2
   1.199 +    /*#4 Bias low  residue.*/
   1.200 +    paddsw mm3,mm0
   1.201 +    /*#4 Bias high residue.*/
   1.202 +    paddsw mm4,mm0
   1.203 +    /*#4 Pack to byte.*/
   1.204 +    packuswb mm3,mm4
   1.205 +    /*#5 Bias low  residue.*/
   1.206 +    paddsw mm5,mm0
   1.207 +    /*#5 Bias high residue.*/
   1.208 +    paddsw mm6,mm0
   1.209 +    /*#5 Pack to byte.*/
   1.210 +    packuswb mm5,mm6
   1.211 +    /*#3 Write row.*/
   1.212 +    movq [DST+YSTRIDE3],mm1
   1.213 +    /*#4 Write row.*/
   1.214 +    movq [DST4],mm3
   1.215 +    /*#5 Write row.*/
   1.216 +    movq [DST4+YSTRIDE],mm5
   1.217 +    /*#6 Load low residue.*/
   1.218 +    movq mm1,[12*8+RESIDUE]
   1.219 +    /*#6 Load high residue.*/
   1.220 +    movq mm2,[13*8+RESIDUE]
   1.221 +    /*#7 Load low residue.*/
   1.222 +    movq mm3,[14*8+RESIDUE]
   1.223 +    /*#7 Load high residue.*/
   1.224 +    movq mm4,[15*8+RESIDUE]
   1.225 +    /*#6 Bias low  residue.*/
   1.226 +    paddsw mm1,mm0
   1.227 +    /*#6 Bias high residue.*/
   1.228 +    paddsw mm2,mm0
   1.229 +    /*#6 Pack to byte.*/
   1.230 +    packuswb mm1,mm2
   1.231 +    /*#7 Bias low  residue.*/
   1.232 +    paddsw mm3,mm0
   1.233 +    /*#7 Bias high residue.*/
   1.234 +    paddsw mm4,mm0
   1.235 +    /*#7 Pack to byte.*/
   1.236 +    packuswb mm3,mm4
   1.237 +    /*#6 Write row.*/
   1.238 +    movq [DST4+YSTRIDE*2],mm1
   1.239 +    /*#7 Write row.*/
   1.240 +    movq [DST4+YSTRIDE3],mm3
   1.241 +#undef DST
   1.242 +#undef DST4
   1.243 +#undef YSTRIDE
   1.244 +#undef YSTRIDE3
   1.245 +#undef RESIDUE
   1.246 +  }
   1.247 +}
   1.248 +
   1.249 +void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
   1.250 + int _ystride,const ogg_int16_t *_residue){/*Stores 8 rows of 8 bytes at _dst; each byte is the matching _src pixel plus its 16-bit _residue value, saturated to 0..255. Each of the 4 loop iterations handles two rows and writes the advanced pointers back into _dst, _src and _residue for the next one.*/
   1.251 +  int i;
   1.252 +  /*Zero mm0; it supplies the zero bytes used to widen source pixels to 16 bits.*/
   1.253 +  __asm pxor mm0,mm0;
   1.254 +  for(i=4;i-->0;){
   1.255 +    __asm{
   1.256 +#define DST edx
   1.257 +#define SRC ecx
   1.258 +#define YSTRIDE edi
   1.259 +#define RESIDUE eax
   1.260 +      mov DST,_dst
   1.261 +      mov SRC,_src
   1.262 +      mov YSTRIDE,_ystride
   1.263 +      mov RESIDUE,_residue
   1.264 +      /*#0 Load source.*/
   1.265 +      movq mm3,[SRC]
   1.266 +      /*#1 Load source.*/
   1.267 +      movq mm7,[SRC+YSTRIDE]
   1.268 +      /*#0 Get copy of src.*/
   1.269 +      movq mm4,mm3
   1.270 +      /*#0 Expand high source.*/
   1.271 +      punpckhbw mm4,mm0
   1.272 +      /*#0 Expand low  source.*/
   1.273 +      punpcklbw mm3,mm0
   1.274 +      /*#0 Add residue high.*/
   1.275 +      paddsw mm4,[8+RESIDUE]
   1.276 +      /*#1 Get copy of src.*/
   1.277 +      movq mm2,mm7
   1.278 +      /*#0 Add residue low.*/
   1.279 +      paddsw  mm3,[RESIDUE]
   1.280 +      /*#1 Expand high source.*/
   1.281 +      punpckhbw mm2,mm0
   1.282 +      /*#0 Pack final row pixels.*/
   1.283 +      packuswb mm3,mm4
   1.284 +      /*#1 Expand low  source.*/
   1.285 +      punpcklbw mm7,mm0
   1.286 +      /*#1 Add residue low.*/
   1.287 +      paddsw mm7,[16+RESIDUE]
   1.288 +      /*#1 Add residue high.*/
   1.289 +      paddsw mm2,[24+RESIDUE]
   1.290 +      /*Advance residue by the two rows (32 bytes) just consumed.*/
   1.291 +      lea RESIDUE,[32+RESIDUE]
   1.292 +      /*#1 Pack final row pixels.*/
   1.293 +      packuswb mm7,mm2
   1.294 +      /*Advance src by two rows.*/
   1.295 +      lea SRC,[SRC+YSTRIDE*2]
   1.296 +      /*#0 Write row.*/
   1.297 +      movq [DST],mm3
   1.298 +      /*#1 Write row.*/
   1.299 +      movq [DST+YSTRIDE],mm7
   1.300 +      /*Advance dst by two rows, then save all three pointers for the next iteration.*/
   1.301 +      lea DST,[DST+YSTRIDE*2]
   1.302 +      mov _residue,RESIDUE
   1.303 +      mov _dst,DST
   1.304 +      mov _src,SRC
   1.305 +#undef DST
   1.306 +#undef SRC
   1.307 +#undef YSTRIDE
   1.308 +#undef RESIDUE
   1.309 +    }
   1.310 +  }
   1.311 +}
   1.312 +
   1.313 +void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
   1.314 + const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){/*Stores 8 rows of 8 bytes at _dst; each byte is (_src1 pixel+_src2 pixel)>>1 plus its 16-bit _residue value, saturated to 0..255. Each of the 4 loop iterations handles two rows and writes the advanced pointers back into _dst, _src1, _src2 and _residue for the next one.*/
   1.315 +  int i;
   1.316 +  /*Zero mm7; it supplies the zero bytes used to widen source pixels to 16 bits.*/
   1.317 +  __asm pxor mm7,mm7;
   1.318 +  for(i=4;i-->0;){
   1.319 +    __asm{
   1.320 +#define SRC1 ecx
   1.321 +#define SRC2 edi
   1.322 +#define YSTRIDE esi
   1.323 +#define RESIDUE edx
   1.324 +#define DST eax
   1.325 +      mov YSTRIDE,_ystride
   1.326 +      mov DST,_dst
   1.327 +      mov RESIDUE,_residue
   1.328 +      mov SRC1,_src1
   1.329 +      mov SRC2,_src2
   1.330 +      /*#0 Load src1.*/
   1.331 +      movq mm0,[SRC1]
   1.332 +      /*#0 Load src2.*/
   1.333 +      movq mm2,[SRC2]
   1.334 +      /*#0 Copy src1.*/
   1.335 +      movq mm1,mm0
   1.336 +      /*#0 Copy src2.*/
   1.337 +      movq mm3,mm2
   1.338 +      /*#1 Load src1.*/
   1.339 +      movq mm4,[SRC1+YSTRIDE]
   1.340 +      /*#0 Unpack lower src1.*/
   1.341 +      punpcklbw mm0,mm7
   1.342 +      /*#1 Load src2.*/
   1.343 +      movq mm5,[SRC2+YSTRIDE]
   1.344 +      /*#0 Unpack higher src1.*/
   1.345 +      punpckhbw mm1,mm7
   1.346 +      /*#0 Unpack lower src2.*/
   1.347 +      punpcklbw mm2,mm7
   1.348 +      /*#0 Unpack higher src2.*/
   1.349 +      punpckhbw mm3,mm7
   1.350 +      /*Advance src1 ptr.*/
   1.351 +      lea SRC1,[SRC1+YSTRIDE*2]
   1.352 +      /*Advance src2 ptr.*/
   1.353 +      lea SRC2,[SRC2+YSTRIDE*2]
   1.354 +      /*#0 Lower src1+src2.*/
   1.355 +      paddsw mm0,mm2
   1.356 +      /*#0 Higher src1+src2.*/
   1.357 +      paddsw mm1,mm3
   1.358 +      /*#1 Copy src1.*/
   1.359 +      movq mm2,mm4
   1.360 +      /*#0 Build lo average.*/
   1.361 +      psraw mm0,1
   1.362 +      /*#1 Copy src2.*/
   1.363 +      movq mm3,mm5
   1.364 +      /*#1 Unpack lower src1.*/
   1.365 +      punpcklbw mm4,mm7
   1.366 +      /*#0 Build hi average.*/
   1.367 +      psraw mm1,1
   1.368 +      /*#1 Unpack higher src1.*/
   1.369 +      punpckhbw mm2,mm7
   1.370 +      /*#0 low+=residue.*/
   1.371 +      paddsw mm0,[RESIDUE]
   1.372 +      /*#1 Unpack lower src2.*/
   1.373 +      punpcklbw mm5,mm7
   1.374 +      /*#0 high+=residue.*/
   1.375 +      paddsw mm1,[8+RESIDUE]
   1.376 +      /*#1 Unpack higher src2.*/
   1.377 +      punpckhbw mm3,mm7
   1.378 +      /*#1 Lower src1+src2.*/
   1.379 +      paddsw mm5,mm4
   1.380 +      /*#0 Pack and saturate.*/
   1.381 +      packuswb mm0,mm1
   1.382 +      /*#1 Higher src1+src2.*/
   1.383 +      paddsw mm3,mm2
   1.384 +      /*#0 Write row.*/
   1.385 +      movq [DST],mm0
   1.386 +      /*#1 Build lo average.*/
   1.387 +      psraw mm5,1
   1.388 +      /*#1 Build hi average.*/
   1.389 +      psraw mm3,1
   1.390 +      /*#1 low+=residue.*/
   1.391 +      paddsw mm5,[16+RESIDUE]
   1.392 +      /*#1 high+=residue.*/
   1.393 +      paddsw mm3,[24+RESIDUE]
   1.394 +      /*#1 Pack and saturate.*/
   1.395 +      packuswb  mm5,mm3
   1.396 +      /*#1 Write row.*/
   1.397 +      movq [DST+YSTRIDE],mm5
   1.398 +      /*Advance residue ptr by the two rows (32 bytes) just consumed.*/
   1.399 +      add RESIDUE,32
   1.400 +      /*Advance dest ptr by two rows, then save all four pointers for the next iteration.*/
   1.401 +      lea DST,[DST+YSTRIDE*2]
   1.402 +      mov _dst,DST
   1.403 +      mov _residue,RESIDUE
   1.404 +      mov _src1,SRC1
   1.405 +      mov _src2,SRC2
   1.406 +#undef SRC1
   1.407 +#undef SRC2
   1.408 +#undef YSTRIDE
   1.409 +#undef RESIDUE
   1.410 +#undef DST
   1.411 +    }
   1.412 +  }
   1.413 +}
   1.414 +
   1.415 +void oc_restore_fpu_mmx(void){/*Executes EMMS, which marks the x87 registers (shared with mm0-mm7) empty so floating-point code can run after the MMX routines.*/
   1.416 +  __asm emms;
   1.417 +}
   1.418 +
   1.419 +#endif

mercurial