media/libtheora/lib/x86_vc/mmxfrag.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Wed, 31 Dec 2014 06:09:35 +0100
changeset:   0:6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm  mov SRC,src \
    __asm  mov DST,dst \
    __asm  mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm  lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm  lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm  lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm  movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm  movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm  movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm  movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm  movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm  movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm  movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm  movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)

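/*For reference only: a portable scalar sketch of what OC_FRAG_COPY_MMX does.
  The helper below is illustrative and uses a hypothetical name; it is not
  part of the upstream file.  The MMX macro above moves each 8-byte row with
  a single movq instead of copying byte by byte.*/
#if 0
static void oc_frag_copy_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  /*Copy eight rows of eight pixels, stepping _ystride bytes between rows.*/
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif
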
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}

/*Copies the fragments specified by the list of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}

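/*Illustrative use only (not from the upstream sources): each entry of
  _fragis selects a fragment, its byte offset within both frame buffers is
  looked up in _frag_buf_offs, and the 8x8 block at that offset is copied
  from the source frame to the destination frame.  A hypothetical caller,
  with made-up indices, might look like this:*/
#if 0
static void copy_two_fragments_example(unsigned char *dst_frame,
 const unsigned char *src_frame,int ystride,const ptrdiff_t *frag_buf_offs){
  /*Copy fragments 0 and 3 from the source frame to the destination frame.*/
  static const ptrdiff_t fragis[2]={0,3};
  oc_frag_copy_list_mmx(dst_frame,src_frame,ystride,fragis,2,frag_buf_offs);
}
#endif
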
/*Reconstructs an intra-coded 8x8 fragment: each output pixel is the
   corresponding residue value biased by 128 and saturated to [0,255].*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low  residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low  residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low  residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low  residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low  residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low  residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low  residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low  residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

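/*For reference only (not part of the upstream file): a scalar sketch of the
  intra reconstruction above, using a hypothetical name.  The MMX code gets
  the clamp for free by adding a 0x0080 bias to each word with paddsw and
  then letting packuswb saturate the result to unsigned bytes.*/
#if 0
static void oc_frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Bias the residue by 128 and clamp to the valid pixel range.*/
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif
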
/*Reconstructs an inter-coded 8x8 fragment: each output pixel is the
   predictor from _src plus the corresponding residue, saturated to [0,255].*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low  source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw  mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low  source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

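/*For reference only (not part of the upstream file): a scalar sketch of the
  inter reconstruction above, using a hypothetical name.  Each predictor byte
  from _src is widened to 16 bits, the residue is added, and the sum is
  clamped back to [0,255] (the clamp that packuswb performs in the MMX code).*/
#if 0
static void oc_frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_src[j]+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif
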
/*Reconstructs a bi-predicted 8x8 fragment: each output pixel is the average
   of the two predictors from _src1 and _src2 plus the corresponding residue,
   saturated to [0,255].*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb  mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

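/*For reference only (not part of the upstream file): a scalar sketch of the
  bi-predicted reconstruction above, using a hypothetical name.  The two
  predictors are averaged with a truncating shift before the residue is added
  and the result is clamped to [0,255].*/
#if 0
static void oc_frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
#endif
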
/*Returns the CPU to a usable x87 floating-point state.  The MMX registers
   alias the x87 register stack, so EMMS must be executed before any
   subsequent floating-point code.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}

#endif
