media/libtheora/lib/x86_vc/mmxfrag.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Wed, 31 Dec 2014 06:09:35 +0100
changeset:   0:6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)
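
/*For illustration only (not compiled): a portable scalar sketch of the copy
   that OC_FRAG_COPY_MMX performs.  The MMX version moves one 8-byte row per
   movq; the sketch below copies the same 8x8 block a byte at a time.  The
   _sketch suffix marks this as an illustrative example, not part of
   libtheora's API.*/
#if 0
static void oc_frag_copy_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    /*Copy one 8-pixel row, then step both pointers to the next row.*/
    for(j=0;j<8;j++)_dst[j]=_src[j];
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif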

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}

/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}

/*Reconstructs an intra-coded fragment: each reconstructed pixel is the
   corresponding residue value biased by 128 and saturated to [0,255].*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080.*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}

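/*For illustration only (not compiled): a scalar sketch of the intra
   reconstruction above.  Intra fragments have no predictor, so each output
   pixel is just its residue value re-biased by 128 and clamped to [0,255];
   the pcmpeqw/psllw/psrlw sequence builds that 0x0080 per-word bias constant
   in mm0 without a memory load, and packuswb does the clamping.*/
#if 0
static void oc_frag_recon_intra_sketch(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      /*Bias by 128 and saturate to an unsigned byte.*/
      int p;
      p=_residue[i*8+j]+128;
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
  }
}
#endif
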
/*Reconstructs an inter-coded fragment: adds the residue to the
   motion-compensated predictor in _src and saturates to [0,255].*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}

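/*For illustration only (not compiled): a scalar sketch of the inter
   reconstruction above.  Each output pixel is the motion-compensated
   predictor byte plus the signed residue, saturated to [0,255] (paddsw plus
   packuswb in the MMX version).*/
#if 0
static void oc_frag_recon_inter_sketch(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      /*Predictor plus residue, saturated to an unsigned byte.*/
      int p;
      p=_src[j]+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src+=_ystride;
  }
}
#endif
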
/*Reconstructs an inter-coded fragment with two predictors: averages _src1
   and _src2, adds the residue, and saturates to [0,255].*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}

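/*For illustration only (not compiled): a scalar sketch of the bi-predictive
   reconstruction above.  The two predictors are averaged with truncation
   (psraw by 1 of their 16-bit sum) before the residue is added and the
   result saturated to [0,255].*/
#if 0
static void oc_frag_recon_inter2_sketch(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride,
 const ogg_int16_t *_residue){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++){
      /*Truncated average of the two predictors, plus residue, saturated.*/
      int p;
      p=((_src1[j]+_src2[j])>>1)+_residue[i*8+j];
      _dst[j]=(unsigned char)(p<0?0:p>255?255:p);
    }
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
#endif
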
/*Clears the MMX state with EMMS so that subsequent x87 floating-point code
   sees a valid FPU tag word; the MMX registers alias the x87 register stack.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}

#endif
