|
1 #if !defined(_x86_vc_mmxloop_H) |
|
2 # define _x86_vc_mmxloop_H (1) |
|
3 # include <stddef.h> |
|
4 # include "x86int.h" |
|
5 |
|
6 #if defined(OC_X86_ASM) |
|
7 |
|
/*Core 8-pixel loop-filter kernel (Theora lflim, cf. Section 7.10 of the spec).
  On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
  On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
   mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.
  Expects LL to point to 8 bytes each holding the filter limit L; the caller
   macros below load LL before invoking this.
  NOTE(review): LL is presumably a register name #defined in "x86int.h" —
   confirm there. MSVC __asm syntax; 32-bit x86 builds only.*/
#define OC_LOOP_FILTER8_MMX __asm{ \
  /*mm7=0*/ \
  __asm pxor mm7,mm7 \
  /*mm6:mm0={a0,...,a7} unpacked to words (low half in mm0, high in mm6)*/ \
  __asm movq mm6,mm0 \
  __asm punpcklbw mm0,mm7 \
  __asm punpckhbw mm6,mm7 \
  /*mm3:mm5={d0,...,d7}*/ \
  __asm movq mm5,mm3 \
  __asm punpcklbw mm3,mm7 \
  __asm punpckhbw mm5,mm7 \
  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
  __asm psubw mm0,mm3 \
  __asm psubw mm6,mm5 \
  /*mm3:mm1={b0,...,b7}*/ \
  __asm movq mm3,mm1 \
  __asm punpcklbw mm1,mm7 \
  __asm movq mm4,mm2 \
  __asm punpckhbw mm3,mm7 \
  /*mm5:mm4={c0,...,c7}*/ \
  __asm movq mm5,mm2 \
  __asm punpcklbw mm4,mm7 \
  __asm punpckhbw mm5,mm7 \
  /*mm7={3}x4 (all-ones words shifted right to leave the low two bits) \
    mm5:mm4={c0-b0,...,c7-b7}*/ \
  __asm pcmpeqw mm7,mm7 \
  __asm psubw mm4,mm1 \
  __asm psrlw mm7,14 \
  __asm psubw mm5,mm3 \
  /*Scale by 3.*/ \
  __asm pmullw mm4,mm7 \
  __asm pmullw mm5,mm7 \
  /*mm7={4}x4 \
    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
  __asm psrlw mm7,1 \
  __asm paddw mm4,mm0 \
  __asm psllw mm7,2 \
  __asm movq mm0,[LL] \
  __asm paddw mm5,mm6 \
  /*R_i has the range [-127,128], so we compute -R_i instead. \
    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
  __asm psubw mm4,mm7 \
  __asm psubw mm5,mm7 \
  __asm psraw mm4,3 \
  __asm psraw mm5,3 \
  __asm pcmpeqb mm7,mm7 \
  __asm packsswb mm4,mm5 \
  __asm pxor mm6,mm6 \
  __asm pxor mm4,mm7 \
  __asm packuswb mm1,mm3 \
  /*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
  /*There's no unsigned byte+signed byte with unsigned saturation op code, so \
    we have to split things by sign (the other option is to work in 16 bits, \
    but working in 8 bits gives much better parallelism). \
    We compute abs(R_i), but save a mask of which terms were negative in mm6. \
    Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
    Finally, we split mm4 into positive and negative pieces using the mask in \
    mm6, and add and subtract them as appropriate.*/ \
  /*mm4=abs(-R_i) (mm6, zeroed above, becomes the sign mask via pcmpgtb)*/ \
  /*mm7=255-2*L (two byte-wise subtractions of L from all-ones mm7)*/ \
  __asm pcmpgtb mm6,mm4 \
  __asm psubb mm7,mm0 \
  __asm pxor mm4,mm6 \
  __asm psubb mm7,mm0 \
  __asm psubb mm4,mm6 \
  /*mm7=255-max(2*L-abs(R_i),0)*/ \
  __asm paddusb mm7,mm4 \
  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0)) via saturating add then subtract*/ \
  __asm paddusb mm4,mm7 \
  __asm psubusb mm4,mm7 \
  /*Now split mm4 by the original sign of -R_i.*/ \
  __asm movq mm5,mm4 \
  __asm pand mm4,mm6 \
  __asm pandn mm6,mm5 \
  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
  /*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
  __asm paddusb mm1,mm4 \
  __asm psubusb mm2,mm4 \
  __asm psubusb mm1,mm6 \
  __asm paddusb mm2,mm6 \
}
|
92 |
|
/*Applies the loop filter across a horizontal edge (vertical filtering):
   loads the four 8-byte rows at _pix-2*_ystride.._pix+_ystride into
   mm0..mm3, runs OC_LOOP_FILTER8_MMX, and writes back the two middle
   (filtered) rows b and c.
  _ll points to the 8-byte loop-filter limit vector loaded into LL.
  NOTE(review): PIX/YSTRIDE/YSTRIDE3/LL are presumably register names
   #defined in "x86int.h" — confirm there.*/
#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
  do{ \
    /*Used local variable pix__ in order to fix compilation errors like: \
       "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \
    unsigned char *pix__; \
    unsigned char *ll__; \
    ll__=(_ll); \
    pix__=(_pix); \
    __asm mov YSTRIDE,_ystride \
    __asm mov LL,ll__ \
    __asm mov PIX,pix__ \
    /*Back up two rows so PIX points at row a.*/ \
    __asm sub PIX,YSTRIDE \
    __asm sub PIX,YSTRIDE \
    /*mm0={a0,...,a7}*/ \
    __asm movq mm0,[PIX] \
    /*ystride3=_ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*mm3={d0,...,d7}*/ \
    __asm movq mm3,[PIX+YSTRIDE3] \
    /*mm1={b0,...,b7}*/ \
    __asm movq mm1,[PIX+YSTRIDE] \
    /*mm2={c0,...,c7}*/ \
    __asm movq mm2,[PIX+YSTRIDE*2] \
    OC_LOOP_FILTER8_MMX \
    /*Write it back out.*/ \
    __asm movq [PIX+YSTRIDE],mm1 \
    __asm movq [PIX+YSTRIDE*2],mm2 \
  } \
  while(0)
|
122 |
|
/*Applies the loop filter across a vertical edge (horizontal filtering):
   gathers the four columns at _pix[-2].._pix[1] from eight consecutive rows,
   transposes them into mm0..mm3 (one register per column), runs
   OC_LOOP_FILTER8_MMX, then scatters the two filtered middle columns (b,c)
   back with 16-bit stores.
  _ll points to the 8-byte loop-filter limit vector loaded into LL.
  NOTE(review): PIX/YSTRIDE/YSTRIDE3/LL/D/D_WORD are presumably register
   names #defined in "x86int.h" — confirm there.*/
#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
  do{ \
    /*Used local variable ll__ in order to fix compilation errors like: \
       "error C2443: operand size conflict".*/ \
    unsigned char *ll__; \
    unsigned char *pix__; \
    ll__=(_ll); \
    /*Start two columns left of the edge so PIX points at column a.*/ \
    pix__=(_pix)-2; \
    __asm mov PIX,pix__ \
    __asm mov YSTRIDE,_ystride \
    __asm mov LL,ll__ \
    /*x x x x d0 c0 b0 a0*/ \
    __asm movd mm0,[PIX] \
    /*x x x x d1 c1 b1 a1*/ \
    __asm movd mm1,[PIX+YSTRIDE] \
    /*ystride3=_ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*x x x x d2 c2 b2 a2*/ \
    __asm movd mm2,[PIX+YSTRIDE*2] \
    /*x x x x d3 c3 b3 a3*/ \
    __asm lea D,[PIX+YSTRIDE*4] \
    __asm movd mm3,[PIX+YSTRIDE3] \
    /*x x x x d4 c4 b4 a4*/ \
    __asm movd mm4,[D] \
    /*x x x x d5 c5 b5 a5*/ \
    __asm movd mm5,[D+YSTRIDE] \
    /*x x x x d6 c6 b6 a6*/ \
    __asm movd mm6,[D+YSTRIDE*2] \
    /*x x x x d7 c7 b7 a7*/ \
    __asm movd mm7,[D+YSTRIDE3] \
    /*4x4 transpose of rows 0-3, then rows 4-7, then combine: \
      mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
    __asm punpcklbw mm0,mm1 \
    /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
    __asm punpcklbw mm2,mm3 \
    /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
    __asm movq mm3,mm0 \
    /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
    __asm punpcklwd mm0,mm2 \
    /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
    __asm punpckhwd mm3,mm2 \
    /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
    __asm movq mm1,mm0 \
    /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
    __asm punpcklbw mm4,mm5 \
    /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
    __asm punpcklbw mm6,mm7 \
    /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
    __asm movq mm5,mm4 \
    /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
    __asm punpcklwd mm4,mm6 \
    /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
    __asm punpckhwd mm5,mm6 \
    /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
    __asm movq mm2,mm3 \
    /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
    __asm punpckldq mm0,mm4 \
    /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
    __asm punpckhdq mm1,mm4 \
    /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
    __asm punpckldq mm2,mm5 \
    /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
    __asm punpckhdq mm3,mm5 \
    OC_LOOP_FILTER8_MMX \
    /*mm0=mm1={b0+R_0'',...,b7+R_7''}: copy the filtered b bytes so both \
       halves can be interleaved with the filtered c bytes in mm2.*/ \
    __asm movq mm0,mm1 \
    /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
    __asm punpcklbw mm1,mm2 \
    /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
    __asm punpckhbw mm0,mm2 \
    /*Scatter back: column b is at offset 1 from PIX, so each 16-bit store \
       at [row+1] writes the b,c pair of one row. \
      [d]=c1 b1 c0 b0*/ \
    __asm movd D,mm1 \
    __asm mov [PIX+1],D_WORD \
    __asm psrlq mm1,32 \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE+1],D_WORD \
    /*[d]=c3 b3 c2 b2*/ \
    __asm movd D,mm1 \
    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE3+1],D_WORD \
    /*Advance to rows 4-7.*/ \
    __asm lea PIX,[PIX+YSTRIDE*4] \
    /*[d]=c5 b5 c4 b4*/ \
    __asm movd D,mm0 \
    __asm mov [PIX+1],D_WORD \
    __asm psrlq mm0,32 \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE+1],D_WORD \
    /*[d]=c7 b7 c6 b6*/ \
    __asm movd D,mm0 \
    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE3+1],D_WORD \
  } \
  while(0)
|
217 |
|
218 # endif |
|
219 #endif |