media/libtheora/lib/x86/mmxfrag.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libtheora/lib/x86/mmxfrag.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,368 @@
     1.4 +/********************************************************************
     1.5 + *                                                                  *
     1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
    1.10 + *                                                                  *
    1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
    1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    1.13 + *                                                                  *
    1.14 + ********************************************************************
    1.15 +
    1.16 +  function:
    1.17 +    last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $
    1.18 +
    1.19 + ********************************************************************/
    1.20 +
    1.21 +/*MMX acceleration of fragment reconstruction for motion compensation.
    1.22 +  Originally written by Rudolf Marek.
    1.23 +  Additional optimization by Nils Pipenbrinck.
    1.24 +  Note: Loops are unrolled for best performance.
    1.25 +  The iteration each instruction belongs to is marked in the comments as #i.*/
    1.26 +#include <stddef.h>
    1.27 +#include "x86int.h"
    1.28 +
    1.29 +#if defined(OC_X86_ASM)
    1.30 +
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The copy is fully unrolled into a single asm block handling two groups of
   four rows each: rows 0..3 are addressed as base, base+ystride,
   base+2*ystride and base+ystride3 (where ystride3=ystride*3 is computed
   once with lea), then both pointers are advanced by 4*ystride for rows
   4..7.
  NOTE(review): movq imposes no alignment requirement, so only 8 readable/
   writable bytes per row are assumed -- confirm callers never pass
   overlapping _src/_dst.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    ptrdiff_t            ystride3; \
    src=(_src); \
    dst=(_dst); \
    __asm__ __volatile__( \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*ystride3=ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[src],%[ystride],4),%[src]\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
      /*Second group of four rows: src/dst now point at row 4.*/ \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      /*src/dst are advanced in place, hence the "+r" constraints; the local \
         copies keep the caller's pointers unmodified.*/ \
      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
      :[ystride]"r"((ptrdiff_t)(_ystride)) \
      :"memory" \
    ); \
  } \
  while(0)
    1.85 +
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  Thin function wrapper around OC_FRAG_COPY_MMX; the macro form is also used
   directly by oc_frag_copy_list_mmx to avoid per-fragment call overhead.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
    1.92 +
    1.93 +/*Copies the fragments specified by the lists of fragment indices from one
    1.94 +   frame to another.
    1.95 +  _dst_frame:     The reference frame to copy to.
    1.96 +  _src_frame:     The reference frame to copy from.
    1.97 +  _ystride:       The row stride of the reference frames.
    1.98 +  _fragis:        A pointer to a list of fragment indices.
    1.99 +  _nfragis:       The number of fragment indices to copy.
   1.100 +  _frag_buf_offs: The offsets of fragments in the reference frames.*/
   1.101 +void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
   1.102 + const unsigned char *_src_frame,int _ystride,
   1.103 + const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
   1.104 +  ptrdiff_t fragii;
   1.105 +  for(fragii=0;fragii<_nfragis;fragii++){
   1.106 +    ptrdiff_t frag_buf_off;
   1.107 +    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
   1.108 +    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
   1.109 +     _src_frame+frag_buf_off,_ystride);
   1.110 +  }
   1.111 +}
   1.112 +
   1.113 +
/*Reconstructs an intra-coded fragment: each of the 64 16-bit residue values
   has 128 added with signed saturation (via the 0x0080008000800080 constant
   built in mm0), is packed to an unsigned byte with saturation, and is
   written out as 8 rows of 8 pixels.
  Rows 0..3 are addressed from _dst and rows 4..7 from dst4=_dst+4*_ystride,
   so no pointer updates are needed inside the asm.
  _dst:     Pointer to the top-left of the destination fragment.
  _ystride: Bytes between destination rows.
  _residue: 64 ogg_int16_t residue values, row-major.*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    "pcmpeqw %%mm0,%%mm0\n\t"
    /*#0 Load low residue.*/
    "movq 0*8(%[residue]),%%mm1\n\t"
    /*#0 Load high residue.*/
    "movq 1*8(%[residue]),%%mm2\n\t"
    /*Set mm0 to 0x8000800080008000.*/
    "psllw $15,%%mm0\n\t"
    /*#1 Load low residue.*/
    "movq 2*8(%[residue]),%%mm3\n\t"
    /*#1 Load high residue.*/
    "movq 3*8(%[residue]),%%mm4\n\t"
    /*Set mm0 to 0x0080008000800080 (the +128 bias for every word).*/
    "psrlw $8,%%mm0\n\t"
    /*#2 Load low residue.*/
    "movq 4*8(%[residue]),%%mm5\n\t"
    /*#2 Load high residue.*/
    "movq 5*8(%[residue]),%%mm6\n\t"
    /*#0 Bias low  residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#0 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#0 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#1 Bias low  residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#1 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#1 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#2 Bias low  residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#2 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#2 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#0 Write row.*/
    "movq %%mm1,(%[dst])\n\t"
    /*#1 Write row.*/
    "movq %%mm3,(%[dst],%[ystride])\n\t"
    /*#2 Write row.*/
    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
    /*#3 Load low residue.*/
    "movq 6*8(%[residue]),%%mm1\n\t"
    /*#3 Load high residue.*/
    "movq 7*8(%[residue]),%%mm2\n\t"
    /*#4 Load low  residue.*/
    "movq 8*8(%[residue]),%%mm3\n\t"
    /*#4 Load high residue.*/
    "movq 9*8(%[residue]),%%mm4\n\t"
    /*#5 Load low  residue.*/
    "movq 10*8(%[residue]),%%mm5\n\t"
    /*#5 Load high residue.*/
    "movq 11*8(%[residue]),%%mm6\n\t"
    /*#3 Bias low  residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#3 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#3 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#4 Bias low  residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#4 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#4 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#5 Bias low  residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#5 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#5 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#3 Write row.*/
    "movq %%mm1,(%[dst],%[ystride3])\n\t"
    /*#4 Write row (rows 4..7 are addressed from dst4).*/
    "movq %%mm3,(%[dst4])\n\t"
    /*#5 Write row.*/
    "movq %%mm5,(%[dst4],%[ystride])\n\t"
    /*#6 Load low residue.*/
    "movq 12*8(%[residue]),%%mm1\n\t"
    /*#6 Load high residue.*/
    "movq 13*8(%[residue]),%%mm2\n\t"
    /*#7 Load low residue.*/
    "movq 14*8(%[residue]),%%mm3\n\t"
    /*#7 Load high residue.*/
    "movq 15*8(%[residue]),%%mm4\n\t"
    /*#6 Bias low  residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#6 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#6 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#7 Bias low  residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#7 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#7 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#6 Write row.*/
    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
    /*#7 Write row.*/
    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
    :
    :[residue]"r"(_residue),
     [dst]"r"(_dst),
     [dst4]"r"(_dst+(_ystride<<2)),
     [ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    :"memory"
  );
}
   1.228 +
/*Reconstructs an inter-coded fragment: each row of the 8x8 predictor in _src
   is expanded from bytes to 16-bit words, the corresponding residue row is
   added with signed saturation, and the result is packed back to unsigned
   bytes (saturating to [0,255]) in _dst.
  Two rows are processed per iteration of the 4-pass loop; src, dst and
   residue are all advanced in place by two rows each pass.
  NOTE(review): mm0 is zeroed in a separate asm statement and assumed to
   still hold zero inside the loop's asm blocks -- nothing in the asm
   contract guarantees this across statements; confirm no MMX-clobbering code
   can be scheduled in between.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load source.*/
      "movq (%[src]),%%mm3\n\t"
      /*#1 Load source.*/
      "movq (%[src],%[ystride]),%%mm7\n\t"
      /*#0 Get copy of src.*/
      "movq %%mm3,%%mm4\n\t"
      /*#0 Expand high source.*/
      "punpckhbw %%mm0,%%mm4\n\t"
      /*#0 Expand low  source.*/
      "punpcklbw %%mm0,%%mm3\n\t"
      /*#0 Add residue high.*/
      "paddsw 8(%[residue]),%%mm4\n\t"
      /*#1 Get copy of src.*/
      "movq %%mm7,%%mm2\n\t"
      /*#0 Add residue low.*/
      "paddsw (%[residue]), %%mm3\n\t"
      /*#1 Expand high source.*/
      "punpckhbw %%mm0,%%mm2\n\t"
      /*#0 Pack final row pixels.*/
      "packuswb %%mm4,%%mm3\n\t"
      /*#1 Expand low  source.*/
      "punpcklbw %%mm0,%%mm7\n\t"
      /*#1 Add residue low.*/
      "paddsw 16(%[residue]),%%mm7\n\t"
      /*#1 Add residue high.*/
      "paddsw 24(%[residue]),%%mm2\n\t"
      /*Advance residue by two rows (32 bytes = 16 words).*/
      "lea 32(%[residue]),%[residue]\n\t"
      /*#1 Pack final row pixels.*/
      "packuswb %%mm2,%%mm7\n\t"
      /*Advance src.*/
      "lea (%[src],%[ystride],2),%[src]\n\t"
      /*#0 Write row.*/
      "movq %%mm3,(%[dst])\n\t"
      /*#1 Write row.*/
      "movq %%mm7,(%[dst],%[ystride])\n\t"
      /*Advance dst.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
   1.280 +
/*Reconstructs a bi-predicted inter-coded fragment: for each pixel the two
   predictors from _src1 and _src2 are expanded to 16-bit words, summed with
   signed saturation and halved with an arithmetic right shift to form their
   (truncating) average, then the residue is added with signed saturation and
   the result is packed back to unsigned bytes in _dst.
  Two rows are processed per iteration of the 4-pass loop; all pointers are
   advanced in place by two rows each pass.
  NOTE(review): mm7 is zeroed in a separate asm statement and assumed to
   survive into the loop's asm blocks (same caveat as
   oc_frag_recon_inter_mmx).*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load src1.*/
      "movq (%[src1]),%%mm0\n\t"
      /*#0 Load src2.*/
      "movq (%[src2]),%%mm2\n\t"
      /*#0 Copy src1.*/
      "movq %%mm0,%%mm1\n\t"
      /*#0 Copy src2.*/
      "movq %%mm2,%%mm3\n\t"
      /*#1 Load src1.*/
      "movq (%[src1],%[ystride]),%%mm4\n\t"
      /*#0 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm0\n\t"
      /*#1 Load src2.*/
      "movq (%[src2],%[ystride]),%%mm5\n\t"
      /*#0 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm1\n\t"
      /*#0 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm2\n\t"
      /*#0 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*Advance src1 ptr.*/
      "lea (%[src1],%[ystride],2),%[src1]\n\t"
      /*Advance src2 ptr.*/
      "lea (%[src2],%[ystride],2),%[src2]\n\t"
      /*#0 Lower src1+src2.*/
      "paddsw %%mm2,%%mm0\n\t"
      /*#0 Higher src1+src2.*/
      "paddsw %%mm3,%%mm1\n\t"
      /*#1 Copy src1.*/
      "movq %%mm4,%%mm2\n\t"
      /*#0 Build lo average.*/
      "psraw $1,%%mm0\n\t"
      /*#1 Copy src2.*/
      "movq %%mm5,%%mm3\n\t"
      /*#1 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm4\n\t"
      /*#0 Build hi average.*/
      "psraw $1,%%mm1\n\t"
      /*#1 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm2\n\t"
      /*#0 low+=residue.*/
      "paddsw (%[residue]),%%mm0\n\t"
      /*#1 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm5\n\t"
      /*#0 high+=residue.*/
      "paddsw 8(%[residue]),%%mm1\n\t"
      /*#1 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*#1 Lower src1+src2.*/
      "paddsw %%mm4,%%mm5\n\t"
      /*#0 Pack and saturate.*/
      "packuswb %%mm1,%%mm0\n\t"
      /*#1 Higher src1+src2.*/
      "paddsw %%mm2,%%mm3\n\t"
      /*#0 Write row.*/
      "movq %%mm0,(%[dst])\n\t"
      /*#1 Build lo average.*/
      "psraw $1,%%mm5\n\t"
      /*#1 Build hi average.*/
      "psraw $1,%%mm3\n\t"
      /*#1 low+=residue.*/
      "paddsw 16(%[residue]),%%mm5\n\t"
      /*#1 high+=residue.*/
      "paddsw 24(%[residue]),%%mm3\n\t"
      /*#1 Pack and saturate.*/
      "packuswb  %%mm3,%%mm5\n\t"
      /*#1 Write row.*/
      "movq %%mm5,(%[dst],%[ystride])\n\t"
      /*Advance residue ptr by two rows (32 bytes = 16 words).*/
      "add $32,%[residue]\n\t"
      /*Advance dest ptr.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
     :[dst]"+r"(_dst),[residue]"+r"(_residue),
      [src1]"+%r"(_src1),[src2]"+r"(_src2)
     :[ystride]"r"((ptrdiff_t)_ystride)
     :"memory"
    );
  }
}
   1.367 +
/*Restores the x87 FPU state after MMX code has run by issuing EMMS, which
   empties the MMX state (marks the shared x87 register stack as free).
  Must be called before any floating-point code executes.*/
void oc_restore_fpu_mmx(void){
  __asm__ __volatile__("emms\n\t");
}
   1.371 +#endif

mercurial