Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tag tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.
michael@0 | 1 | #if !defined(_x86_vc_mmxloop_H) |
michael@0 | 2 | # define _x86_vc_mmxloop_H (1) |
michael@0 | 3 | # include <stddef.h> |
michael@0 | 4 | # include "x86int.h" |
michael@0 | 5 | |
michael@0 | 6 | #if defined(OC_X86_ASM) |
michael@0 | 7 | |
michael@0 | 8 | /*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}. |
michael@0 | 9 | On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and |
michael@0 | 10 | mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/ |
michael@0 | 11 | #define OC_LOOP_FILTER8_MMX __asm{ \ |
michael@0 | 12 | /*mm7=0*/ \ |
michael@0 | 13 | __asm pxor mm7,mm7 \ |
michael@0 | 14 | /*mm6:mm0={a0,...,a7}*/ \ |
michael@0 | 15 | __asm movq mm6,mm0 \ |
michael@0 | 16 | __asm punpcklbw mm0,mm7 \ |
michael@0 | 17 | __asm punpckhbw mm6,mm7 \ |
michael@0 | 18 | /*mm3:mm5={d0,...,d7}*/ \ |
michael@0 | 19 | __asm movq mm5,mm3 \ |
michael@0 | 20 | __asm punpcklbw mm3,mm7 \ |
michael@0 | 21 | __asm punpckhbw mm5,mm7 \ |
michael@0 | 22 | /*mm6:mm0={a0-d0,...,a7-d7}*/ \ |
michael@0 | 23 | __asm psubw mm0,mm3 \ |
michael@0 | 24 | __asm psubw mm6,mm5 \ |
michael@0 | 25 | /*mm3:mm1={b0,...,b7}*/ \ |
michael@0 | 26 | __asm movq mm3,mm1 \ |
michael@0 | 27 | __asm punpcklbw mm1,mm7 \ |
michael@0 | 28 | __asm movq mm4,mm2 \ |
michael@0 | 29 | __asm punpckhbw mm3,mm7 \ |
michael@0 | 30 | /*mm5:mm4={c0,...,c7}*/ \ |
michael@0 | 31 | __asm movq mm5,mm2 \ |
michael@0 | 32 | __asm punpcklbw mm4,mm7 \ |
michael@0 | 33 | __asm punpckhbw mm5,mm7 \ |
michael@0 | 34 | /*mm7={3}x4 \ |
michael@0 | 35 | mm5:mm4={c0-b0,...,c7-b7}*/ \ |
michael@0 | 36 | __asm pcmpeqw mm7,mm7 \ |
michael@0 | 37 | __asm psubw mm4,mm1 \ |
michael@0 | 38 | __asm psrlw mm7,14 \ |
michael@0 | 39 | __asm psubw mm5,mm3 \ |
michael@0 | 40 | /*Scale by 3.*/ \ |
michael@0 | 41 | __asm pmullw mm4,mm7 \ |
michael@0 | 42 | __asm pmullw mm5,mm7 \ |
michael@0 | 43 | /*mm7={4}x4 \ |
michael@0 | 44 | mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \ |
michael@0 | 45 | __asm psrlw mm7,1 \ |
michael@0 | 46 | __asm paddw mm4,mm0 \ |
michael@0 | 47 | __asm psllw mm7,2 \ |
michael@0 | 48 | __asm movq mm0,[LL] \ |
michael@0 | 49 | __asm paddw mm5,mm6 \ |
michael@0 | 50 | /*R_i has the range [-127,128], so we compute -R_i instead. \ |
michael@0 | 51 | mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \ |
michael@0 | 52 | __asm psubw mm4,mm7 \ |
michael@0 | 53 | __asm psubw mm5,mm7 \ |
michael@0 | 54 | __asm psraw mm4,3 \ |
michael@0 | 55 | __asm psraw mm5,3 \ |
michael@0 | 56 | __asm pcmpeqb mm7,mm7 \ |
michael@0 | 57 | __asm packsswb mm4,mm5 \ |
michael@0 | 58 | __asm pxor mm6,mm6 \ |
michael@0 | 59 | __asm pxor mm4,mm7 \ |
michael@0 | 60 | __asm packuswb mm1,mm3 \ |
michael@0 | 61 | /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \ |
michael@0 | 62 | /*There's no unsigned byte+signed byte with unsigned saturation op code, so \ |
michael@0 | 63 | we have to split things by sign (the other option is to work in 16 bits, \ |
michael@0 | 64 | but working in 8 bits gives much better parallelism). \ |
michael@0 | 65 | We compute abs(R_i), but save a mask of which terms were negative in mm6. \ |
michael@0 | 66 | Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \ |
michael@0 | 67 | Finally, we split mm4 into positive and negative pieces using the mask in \ |
michael@0 | 68 | mm6, and add and subtract them as appropriate.*/ \ |
michael@0 | 69 | /*mm4=abs(-R_i)*/ \ |
michael@0 | 70 | /*mm7=255-2*L*/ \ |
michael@0 | 71 | __asm pcmpgtb mm6,mm4 \ |
michael@0 | 72 | __asm psubb mm7,mm0 \ |
michael@0 | 73 | __asm pxor mm4,mm6 \ |
michael@0 | 74 | __asm psubb mm7,mm0 \ |
michael@0 | 75 | __asm psubb mm4,mm6 \ |
michael@0 | 76 | /*mm7=255-max(2*L-abs(R_i),0)*/ \ |
michael@0 | 77 | __asm paddusb mm7,mm4 \ |
michael@0 | 78 | /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \ |
michael@0 | 79 | __asm paddusb mm4,mm7 \ |
michael@0 | 80 | __asm psubusb mm4,mm7 \ |
michael@0 | 81 | /*Now split mm4 by the original sign of -R_i.*/ \ |
michael@0 | 82 | __asm movq mm5,mm4 \ |
michael@0 | 83 | __asm pand mm4,mm6 \ |
michael@0 | 84 | __asm pandn mm6,mm5 \ |
michael@0 | 85 | /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \ |
michael@0 | 86 | /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \ |
michael@0 | 87 | __asm paddusb mm1,mm4 \ |
michael@0 | 88 | __asm psubusb mm2,mm4 \ |
michael@0 | 89 | __asm psubusb mm1,mm6 \ |
michael@0 | 90 | __asm paddusb mm2,mm6 \ |
michael@0 | 91 | } |
michael@0 | 92 | |
/*NOTE(review): Filters across a horizontal block edge. Loads the four rows
  a,b,c,d starting at _pix-2*_ystride (PIX is rewound by two strides before
  the movq loads), runs OC_LOOP_FILTER8_MMX with limit table _ll in LL, and
  stores only the modified middle rows b (mm1) and c (mm2) back.*/
michael@0 | 93 | #define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \ |
michael@0 | 94 | do{ \ |
michael@0 | 95 | /*Used local variable pix__ in order to fix compilation errors like: \ |
michael@0 | 96 | "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \ |
michael@0 | 97 | unsigned char *pix__; \ |
michael@0 | 98 | unsigned char *ll__; \ |
michael@0 | 99 | ll__=(_ll); \ |
michael@0 | 100 | pix__=(_pix); \ |
michael@0 | 101 | __asm mov YSTRIDE,_ystride \ |
michael@0 | 102 | __asm mov LL,ll__ \ |
michael@0 | 103 | __asm mov PIX,pix__ \ |
michael@0 | 104 | __asm sub PIX,YSTRIDE \ |
michael@0 | 105 | __asm sub PIX,YSTRIDE \ |
michael@0 | 106 | /*mm0={a0,...,a7}*/ \ |
michael@0 | 107 | __asm movq mm0,[PIX] \ |
michael@0 | 108 | /*ystride3=_ystride*3*/ \ |
michael@0 | 109 | __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ |
michael@0 | 110 | /*mm3={d0,...,d7}*/ \ |
michael@0 | 111 | __asm movq mm3,[PIX+YSTRIDE3] \ |
michael@0 | 112 | /*mm1={b0,...,b7}*/ \ |
michael@0 | 113 | __asm movq mm1,[PIX+YSTRIDE] \ |
michael@0 | 114 | /*mm2={c0,...,c7}*/ \ |
michael@0 | 115 | __asm movq mm2,[PIX+YSTRIDE*2] \ |
michael@0 | 116 | OC_LOOP_FILTER8_MMX \ |
michael@0 | 117 | /*Write it back out.*/ \ |
michael@0 | 118 | __asm movq [PIX+YSTRIDE],mm1 \ |
michael@0 | 119 | __asm movq [PIX+YSTRIDE*2],mm2 \ |
michael@0 | 120 | } \ |
michael@0 | 121 | while(0)
michael@0 | 122 | |
/*NOTE(review): Filters across a vertical block edge. Gathers 4 bytes
  a,b,c,d from each of 8 rows starting at _pix-2, transposes them into
  mm0..mm3 via the punpck* sequence, runs OC_LOOP_FILTER8_MMX with limit
  table _ll, then scatters the filtered b,c byte pairs back as 16-bit
  stores at offset +1 of each row (i.e. columns _pix-1 and _pix).*/
michael@0 | 123 | #define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \ |
michael@0 | 124 | do{ \ |
michael@0 | 125 | /*Used local variable ll__ in order to fix compilation errors like: \ |
michael@0 | 126 | "error C2443: operand size conflict".*/ \ |
michael@0 | 127 | unsigned char *ll__; \ |
michael@0 | 128 | unsigned char *pix__; \ |
michael@0 | 129 | ll__=(_ll); \ |
michael@0 | 130 | pix__=(_pix)-2; \ |
michael@0 | 131 | __asm mov PIX,pix__ \ |
michael@0 | 132 | __asm mov YSTRIDE,_ystride \ |
michael@0 | 133 | __asm mov LL,ll__ \ |
michael@0 | 134 | /*x x x x d0 c0 b0 a0*/ \ |
michael@0 | 135 | __asm movd mm0,[PIX] \ |
michael@0 | 136 | /*x x x x d1 c1 b1 a1*/ \ |
michael@0 | 137 | __asm movd mm1,[PIX+YSTRIDE] \ |
michael@0 | 138 | /*ystride3=_ystride*3*/ \ |
michael@0 | 139 | __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \ |
michael@0 | 140 | /*x x x x d2 c2 b2 a2*/ \ |
michael@0 | 141 | __asm movd mm2,[PIX+YSTRIDE*2] \ |
michael@0 | 142 | /*x x x x d3 c3 b3 a3*/ \ |
michael@0 | 143 | __asm lea D,[PIX+YSTRIDE*4] \ |
michael@0 | 144 | __asm movd mm3,[PIX+YSTRIDE3] \ |
michael@0 | 145 | /*x x x x d4 c4 b4 a4*/ \ |
michael@0 | 146 | __asm movd mm4,[D] \ |
michael@0 | 147 | /*x x x x d5 c5 b5 a5*/ \ |
michael@0 | 148 | __asm movd mm5,[D+YSTRIDE] \ |
michael@0 | 149 | /*x x x x d6 c6 b6 a6*/ \ |
michael@0 | 150 | __asm movd mm6,[D+YSTRIDE*2] \ |
michael@0 | 151 | /*x x x x d7 c7 b7 a7*/ \ |
michael@0 | 152 | __asm movd mm7,[D+YSTRIDE3] \ |
michael@0 | 153 | /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \ |
michael@0 | 154 | __asm punpcklbw mm0,mm1 \ |
michael@0 | 155 | /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \ |
michael@0 | 156 | __asm punpcklbw mm2,mm3 \ |
michael@0 | 157 | /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \ |
michael@0 | 158 | __asm movq mm3,mm0 \ |
michael@0 | 159 | /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \ |
michael@0 | 160 | __asm punpcklwd mm0,mm2 \ |
michael@0 | 161 | /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \ |
michael@0 | 162 | __asm punpckhwd mm3,mm2 \ |
michael@0 | 163 | /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \ |
michael@0 | 164 | __asm movq mm1,mm0 \ |
michael@0 | 165 | /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \ |
michael@0 | 166 | __asm punpcklbw mm4,mm5 \ |
michael@0 | 167 | /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \ |
michael@0 | 168 | __asm punpcklbw mm6,mm7 \ |
michael@0 | 169 | /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \ |
michael@0 | 170 | __asm movq mm5,mm4 \ |
michael@0 | 171 | /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \ |
michael@0 | 172 | __asm punpcklwd mm4,mm6 \ |
michael@0 | 173 | /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \ |
michael@0 | 174 | __asm punpckhwd mm5,mm6 \ |
michael@0 | 175 | /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \ |
michael@0 | 176 | __asm movq mm2,mm3 \ |
michael@0 | 177 | /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \ |
michael@0 | 178 | __asm punpckldq mm0,mm4 \ |
michael@0 | 179 | /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \ |
michael@0 | 180 | __asm punpckhdq mm1,mm4 \ |
michael@0 | 181 | /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \ |
michael@0 | 182 | __asm punpckldq mm2,mm5 \ |
michael@0 | 183 | /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \ |
michael@0 | 184 | __asm punpckhdq mm3,mm5 \ |
michael@0 | 185 | OC_LOOP_FILTER8_MMX \ |
michael@0 | 186 | /*mm2={b0+R_0'',...,b7+R_7''}*/ \ |
michael@0 | 187 | __asm movq mm0,mm1 \ |
michael@0 | 188 | /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \ |
michael@0 | 189 | __asm punpcklbw mm1,mm2 \ |
michael@0 | 190 | /*mm2={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \ |
michael@0 | 191 | __asm punpckhbw mm0,mm2 \ |
michael@0 | 192 | /*[d]=c1 b1 c0 b0*/ \ |
michael@0 | 193 | __asm movd D,mm1 \ |
michael@0 | 194 | __asm mov [PIX+1],D_WORD \ |
michael@0 | 195 | __asm psrlq mm1,32 \ |
michael@0 | 196 | __asm shr D,16 \ |
michael@0 | 197 | __asm mov [PIX+YSTRIDE+1],D_WORD \ |
michael@0 | 198 | /*[d]=c3 b3 c2 b2*/ \ |
michael@0 | 199 | __asm movd D,mm1 \ |
michael@0 | 200 | __asm mov [PIX+YSTRIDE*2+1],D_WORD \ |
michael@0 | 201 | __asm shr D,16 \ |
michael@0 | 202 | __asm mov [PIX+YSTRIDE3+1],D_WORD \ |
michael@0 | 203 | __asm lea PIX,[PIX+YSTRIDE*4] \ |
michael@0 | 204 | /*[d]=c5 b5 c4 b4*/ \ |
michael@0 | 205 | __asm movd D,mm0 \ |
michael@0 | 206 | __asm mov [PIX+1],D_WORD \ |
michael@0 | 207 | __asm psrlq mm0,32 \ |
michael@0 | 208 | __asm shr D,16 \ |
michael@0 | 209 | __asm mov [PIX+YSTRIDE+1],D_WORD \ |
michael@0 | 210 | /*[d]=c7 b7 c6 b6*/ \ |
michael@0 | 211 | __asm movd D,mm0 \ |
michael@0 | 212 | __asm mov [PIX+YSTRIDE*2+1],D_WORD \ |
michael@0 | 213 | __asm shr D,16 \ |
michael@0 | 214 | __asm mov [PIX+YSTRIDE3+1],D_WORD \ |
michael@0 | 215 | } \ |
michael@0 | 216 | while(0)
michael@0 | 217 | |
michael@0 | 218 | # endif |
michael@0 | 219 | #endif |