media/libtheora/lib/x86/mmxfrag.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Wed, 31 Dec 2014 06:09:35 +0100
changeset:   0:6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    ptrdiff_t            ystride3; \
    src=(_src); \
    dst=(_dst); \
    __asm__ __volatile__( \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*ystride3=ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[src],%[ystride],4),%[src]\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
      :[ystride]"r"((ptrdiff_t)(_ystride)) \
      :"memory" \
    ); \
  } \
  while(0)
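
/*For reference, a plain-C sketch of the same 8x8 copy; this is illustrative
   only and not part of the build (hence the #if 0), assuming the usual
   fragment layout of 8 rows spaced _ystride bytes apart.*/
#if 0
static void oc_frag_copy_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    /*Copy one 8-byte row, then step both pointers down a row.*/
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif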

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}

/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
  }
}


void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    "pcmpeqw %%mm0,%%mm0\n\t"
    /*#0 Load low residue.*/
    "movq 0*8(%[residue]),%%mm1\n\t"
    /*#0 Load high residue.*/
    "movq 1*8(%[residue]),%%mm2\n\t"
    /*Set mm0 to 0x8000800080008000.*/
    "psllw $15,%%mm0\n\t"
    /*#1 Load low residue.*/
    "movq 2*8(%[residue]),%%mm3\n\t"
    /*#1 Load high residue.*/
    "movq 3*8(%[residue]),%%mm4\n\t"
    /*Set mm0 to 0x0080008000800080.*/
    "psrlw $8,%%mm0\n\t"
    /*#2 Load low residue.*/
    "movq 4*8(%[residue]),%%mm5\n\t"
    /*#2 Load high residue.*/
    "movq 5*8(%[residue]),%%mm6\n\t"
    /*#0 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#0 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#0 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#1 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#1 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#1 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#2 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#2 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#2 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#0 Write row.*/
    "movq %%mm1,(%[dst])\n\t"
    /*#1 Write row.*/
    "movq %%mm3,(%[dst],%[ystride])\n\t"
    /*#2 Write row.*/
    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
    /*#3 Load low residue.*/
    "movq 6*8(%[residue]),%%mm1\n\t"
    /*#3 Load high residue.*/
    "movq 7*8(%[residue]),%%mm2\n\t"
    /*#4 Load low residue.*/
    "movq 8*8(%[residue]),%%mm3\n\t"
    /*#4 Load high residue.*/
    "movq 9*8(%[residue]),%%mm4\n\t"
    /*#5 Load low residue.*/
    "movq 10*8(%[residue]),%%mm5\n\t"
    /*#5 Load high residue.*/
    "movq 11*8(%[residue]),%%mm6\n\t"
    /*#3 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#3 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#3 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#4 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#4 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#4 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#5 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#5 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#5 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#3 Write row.*/
    "movq %%mm1,(%[dst],%[ystride3])\n\t"
    /*#4 Write row.*/
    "movq %%mm3,(%[dst4])\n\t"
    /*#5 Write row.*/
    "movq %%mm5,(%[dst4],%[ystride])\n\t"
    /*#6 Load low residue.*/
    "movq 12*8(%[residue]),%%mm1\n\t"
    /*#6 Load high residue.*/
    "movq 13*8(%[residue]),%%mm2\n\t"
    /*#7 Load low residue.*/
    "movq 14*8(%[residue]),%%mm3\n\t"
    /*#7 Load high residue.*/
    "movq 15*8(%[residue]),%%mm4\n\t"
    /*#6 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#6 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#6 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#7 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#7 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#7 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#6 Write row.*/
    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
    /*#7 Write row.*/
    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
    :
    :[residue]"r"(_residue),
     [dst]"r"(_dst),
     [dst4]"r"(_dst+(_ystride<<2)),
     [ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    :"memory"
  );
}
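
/*A plain-C sketch of the intra reconstruction above; illustrative only and
   not part of the build.  Biasing each residue word by 0x0080 with paddsw and
   then packing with packuswb amounts to clamping residue+128 into [0,255].*/
#if 0
static void oc_frag_recon_intra_c_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      /*Intra blocks are coded around a mid-gray level of 128.*/
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif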

void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load source.*/
      "movq (%[src]),%%mm3\n\t"
      /*#1 Load source.*/
      "movq (%[src],%[ystride]),%%mm7\n\t"
      /*#0 Get copy of src.*/
      "movq %%mm3,%%mm4\n\t"
      /*#0 Expand high source.*/
      "punpckhbw %%mm0,%%mm4\n\t"
      /*#0 Expand low source.*/
      "punpcklbw %%mm0,%%mm3\n\t"
      /*#0 Add residue high.*/
      "paddsw 8(%[residue]),%%mm4\n\t"
      /*#1 Get copy of src.*/
      "movq %%mm7,%%mm2\n\t"
      /*#0 Add residue low.*/
      "paddsw (%[residue]),%%mm3\n\t"
      /*#1 Expand high source.*/
      "punpckhbw %%mm0,%%mm2\n\t"
      /*#0 Pack final row pixels.*/
      "packuswb %%mm4,%%mm3\n\t"
      /*#1 Expand low source.*/
      "punpcklbw %%mm0,%%mm7\n\t"
      /*#1 Add residue low.*/
      "paddsw 16(%[residue]),%%mm7\n\t"
      /*#1 Add residue high.*/
      "paddsw 24(%[residue]),%%mm2\n\t"
      /*Advance residue.*/
      "lea 32(%[residue]),%[residue]\n\t"
      /*#1 Pack final row pixels.*/
      "packuswb %%mm2,%%mm7\n\t"
      /*Advance src.*/
      "lea (%[src],%[ystride],2),%[src]\n\t"
      /*#0 Write row.*/
      "movq %%mm3,(%[dst])\n\t"
      /*#1 Write row.*/
      "movq %%mm7,(%[dst],%[ystride])\n\t"
      /*Advance dst.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
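
/*A plain-C sketch of the inter reconstruction above; illustrative only and
   not part of the build.  Each predictor byte is widened to 16 bits, the
   residue is added, and the sum is clamped back to [0,255] as packuswb does.*/
#if 0
static void oc_frag_recon_inter_c_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=_src[j]+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif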

void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load src1.*/
      "movq (%[src1]),%%mm0\n\t"
      /*#0 Load src2.*/
      "movq (%[src2]),%%mm2\n\t"
      /*#0 Copy src1.*/
      "movq %%mm0,%%mm1\n\t"
      /*#0 Copy src2.*/
      "movq %%mm2,%%mm3\n\t"
      /*#1 Load src1.*/
      "movq (%[src1],%[ystride]),%%mm4\n\t"
      /*#0 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm0\n\t"
      /*#1 Load src2.*/
      "movq (%[src2],%[ystride]),%%mm5\n\t"
      /*#0 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm1\n\t"
      /*#0 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm2\n\t"
      /*#0 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*Advance src1 ptr.*/
      "lea (%[src1],%[ystride],2),%[src1]\n\t"
      /*Advance src2 ptr.*/
      "lea (%[src2],%[ystride],2),%[src2]\n\t"
      /*#0 Lower src1+src2.*/
      "paddsw %%mm2,%%mm0\n\t"
      /*#0 Higher src1+src2.*/
      "paddsw %%mm3,%%mm1\n\t"
      /*#1 Copy src1.*/
      "movq %%mm4,%%mm2\n\t"
      /*#0 Build lo average.*/
      "psraw $1,%%mm0\n\t"
      /*#1 Copy src2.*/
      "movq %%mm5,%%mm3\n\t"
      /*#1 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm4\n\t"
      /*#0 Build hi average.*/
      "psraw $1,%%mm1\n\t"
      /*#1 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm2\n\t"
      /*#0 low+=residue.*/
      "paddsw (%[residue]),%%mm0\n\t"
      /*#1 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm5\n\t"
      /*#0 high+=residue.*/
      "paddsw 8(%[residue]),%%mm1\n\t"
      /*#1 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*#1 Lower src1+src2.*/
      "paddsw %%mm4,%%mm5\n\t"
      /*#0 Pack and saturate.*/
      "packuswb %%mm1,%%mm0\n\t"
      /*#1 Higher src1+src2.*/
      "paddsw %%mm2,%%mm3\n\t"
      /*#0 Write row.*/
      "movq %%mm0,(%[dst])\n\t"
      /*#1 Build lo average.*/
      "psraw $1,%%mm5\n\t"
      /*#1 Build hi average.*/
      "psraw $1,%%mm3\n\t"
      /*#1 low+=residue.*/
      "paddsw 16(%[residue]),%%mm5\n\t"
      /*#1 high+=residue.*/
      "paddsw 24(%[residue]),%%mm3\n\t"
      /*#1 Pack and saturate.*/
      "packuswb %%mm3,%%mm5\n\t"
      /*#1 Write row.*/
      "movq %%mm5,(%[dst],%[ystride])\n\t"
      /*Advance residue ptr.*/
      "add $32,%[residue]\n\t"
      /*Advance dest ptr.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
       [src1]"+%r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
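
/*A plain-C sketch of the bi-directional reconstruction above; illustrative
   only and not part of the build.  The two predictors are averaged with a
   truncating shift before the residue is added and the result is clamped.*/
#if 0
static void oc_frag_recon_inter2_c_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      int p;
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
#endif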

/*Clears the MMX register state with EMMS so that subsequent x87
   floating-point code can run safely.*/
void oc_restore_fpu_mmx(void){
  __asm__ __volatile__("emms\n\t");
}
#endif
