|
1 #ifndef MMX_X64_H_INCLUDED |
|
2 #define MMX_X64_H_INCLUDED |
|
3 |
|
4 /* Implementation of x64 MMX substitution functions, before |
|
5 * pixman is reimplemented not to use __m64 type on Visual C++ |
|
6 * |
|
7 * Copyright (C)2009 by George Yohng |
|
8 * Released in public domain. |
|
9 */ |
|
10 |
|
11 #include <intrin.h> |
|
12 |
|
13 #define M64C(a) (*(const __m64 *)(&a)) |
|
14 #define M64U(a) (*(const unsigned long long *)(&a)) |
|
15 |
|
16 __inline __m64 |
|
17 _m_from_int (int a) |
|
18 { |
|
19 long long i64 = a; |
|
20 |
|
21 return M64C (i64); |
|
22 } |
|
23 |
|
24 __inline __m64 |
|
25 _mm_setzero_si64 () |
|
26 { |
|
27 long long i64 = 0; |
|
28 |
|
29 return M64C (i64); |
|
30 } |
|
31 |
|
32 __inline __m64 |
|
33 _mm_set_pi32 (int i1, int i0) |
|
34 { |
|
35 unsigned long long i64 = ((unsigned)i0) + (((unsigned long long)(unsigned)i1) << 32); |
|
36 |
|
37 return M64C (i64); |
|
38 } |
|
39 |
|
/* Intentionally a no-op: on x64 these helpers emulate MMX with plain
 * 64-bit integer code, so there is no MMX register state for the real
 * _m_empty (EMMS) to clear. */
__inline void
_m_empty ()
{
}
|
44 |
|
45 __inline __m64 |
|
46 _mm_set1_pi16 (short w) |
|
47 { |
|
48 unsigned long long i64 = ((unsigned long long)(unsigned short)(w)) * 0x0001000100010001ULL; |
|
49 |
|
50 return M64C (i64); |
|
51 } |
|
52 |
|
53 __inline int |
|
54 _m_to_int (__m64 m) |
|
55 { |
|
56 return m.m64_i32[0]; |
|
57 } |
|
58 |
|
59 __inline __m64 |
|
60 _mm_movepi64_pi64 (__m128i a) |
|
61 { |
|
62 return M64C (a.m128i_i64[0]); |
|
63 } |
|
64 |
|
65 __inline __m64 |
|
66 _m_pand (__m64 a, __m64 b) |
|
67 { |
|
68 unsigned long long i64 = M64U (a) & M64U (b); |
|
69 |
|
70 return M64C (i64); |
|
71 } |
|
72 |
|
73 __inline __m64 |
|
74 _m_por (__m64 a, __m64 b) |
|
75 { |
|
76 unsigned long long i64 = M64U (a) | M64U (b); |
|
77 |
|
78 return M64C (i64); |
|
79 } |
|
80 |
|
81 __inline __m64 |
|
82 _m_pxor (__m64 a, __m64 b) |
|
83 { |
|
84 unsigned long long i64 = M64U (a) ^ M64U (b); |
|
85 |
|
86 return M64C (i64); |
|
87 } |
|
88 |
|
89 __inline __m64 |
|
90 _m_pmulhuw (__m64 a, __m64 b) /* unoptimized */ |
|
91 { |
|
92 unsigned short d[4] = |
|
93 { |
|
94 (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]) >> 16), |
|
95 (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]) >> 16), |
|
96 (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]) >> 16), |
|
97 (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]) >> 16) |
|
98 }; |
|
99 |
|
100 return M64C (d[0]); |
|
101 } |
|
102 |
|
103 __inline __m64 |
|
104 _m_pmullw2 (__m64 a, __m64 b) /* unoptimized */ |
|
105 { |
|
106 unsigned short d[4] = |
|
107 { |
|
108 (unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0])), |
|
109 (unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1])), |
|
110 (unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2])), |
|
111 (unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3])) |
|
112 }; |
|
113 |
|
114 return M64C (d[0]); |
|
115 } |
|
116 |
|
117 __inline __m64 |
|
118 _m_pmullw (__m64 a, __m64 b) /* unoptimized */ |
|
119 { |
|
120 unsigned long long x = |
|
121 ((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[0]) * b.m64_u16[0]))) + |
|
122 (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[1]) * b.m64_u16[1]))) << 16) + |
|
123 (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[2]) * b.m64_u16[2]))) << 32) + |
|
124 (((unsigned long long)(unsigned short)((((unsigned)a.m64_u16[3]) * b.m64_u16[3]))) << 48); |
|
125 |
|
126 return M64C (x); |
|
127 } |
|
128 |
|
/* PADDUSB: per-byte unsigned saturating add.
 *
 * Operates on all eight bytes at once.  The even-indexed bytes are
 * summed in x and the odd-indexed bytes (shifted down) in y, each
 * spread into its own 16-bit lane so every 8-bit sum has headroom for
 * its carry.  A sum exceeded 0xFF exactly when bit 8 of its lane is
 * set; multiplying that carry bit by 0xFF smears 0xFF over the low
 * byte of the lane, i.e. saturates it.  Finally the even and odd
 * result bytes are re-interleaved. */
__inline __m64
_m_paddusb (__m64 a, __m64 b) /* unoptimized */
{
    /* even-indexed bytes, one per 16-bit lane */
    unsigned long long x = (M64U (a) & 0x00FF00FF00FF00FFULL) +
                           (M64U (b) & 0x00FF00FF00FF00FFULL);

    /* odd-indexed bytes, likewise */
    unsigned long long y = ((M64U (a) >> 8) & 0x00FF00FF00FF00FFULL) +
                           ((M64U (b) >> 8) & 0x00FF00FF00FF00FFULL);

    /* saturate: replicate each lane's carry bit across its low byte */
    x |= ((x & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;
    y |= ((y & 0xFF00FF00FF00FF00ULL) >> 8) * 0xFF;

    /* re-interleave even and odd result bytes */
    x = (x & 0x00FF00FF00FF00FFULL) | ((y & 0x00FF00FF00FF00FFULL) << 8);

    return M64C (x);
}
|
145 |
|
146 __inline __m64 |
|
147 _m_paddusw (__m64 a, __m64 b) /* unoptimized */ |
|
148 { |
|
149 unsigned long long x = (M64U (a) & 0x0000FFFF0000FFFFULL) + |
|
150 (M64U (b) & 0x0000FFFF0000FFFFULL); |
|
151 |
|
152 unsigned long long y = ((M64U (a) >> 16) & 0x0000FFFF0000FFFFULL) + |
|
153 ((M64U (b) >> 16) & 0x0000FFFF0000FFFFULL); |
|
154 |
|
155 x |= ((x & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF; |
|
156 y |= ((y & 0xFFFF0000FFFF0000) >> 16) * 0xFFFF; |
|
157 |
|
158 x = (x & 0x0000FFFF0000FFFFULL) | ((y & 0x0000FFFF0000FFFFULL) << 16); |
|
159 |
|
160 return M64C (x); |
|
161 } |
|
162 |
|
163 __inline __m64 |
|
164 _m_pshufw (__m64 a, int n) /* unoptimized */ |
|
165 { |
|
166 unsigned short d[4] = |
|
167 { |
|
168 a.m64_u16[n & 3], |
|
169 a.m64_u16[(n >> 2) & 3], |
|
170 a.m64_u16[(n >> 4) & 3], |
|
171 a.m64_u16[(n >> 6) & 3] |
|
172 }; |
|
173 |
|
174 return M64C (d[0]); |
|
175 } |
|
176 |
|
177 __inline unsigned char |
|
178 sat16 (unsigned short d) |
|
179 { |
|
180 if (d > 0xFF) return 0xFF; |
|
181 else return d & 0xFF; |
|
182 } |
|
183 |
|
184 __inline __m64 |
|
185 _m_packuswb (__m64 m1, __m64 m2) /* unoptimized */ |
|
186 { |
|
187 unsigned char d[8] = |
|
188 { |
|
189 sat16 (m1.m64_u16[0]), |
|
190 sat16 (m1.m64_u16[1]), |
|
191 sat16 (m1.m64_u16[2]), |
|
192 sat16 (m1.m64_u16[3]), |
|
193 sat16 (m2.m64_u16[0]), |
|
194 sat16 (m2.m64_u16[1]), |
|
195 sat16 (m2.m64_u16[2]), |
|
196 sat16 (m2.m64_u16[3]) |
|
197 }; |
|
198 |
|
199 return M64C (d[0]); |
|
200 } |
|
201 |
|
202 __inline __m64 _m_punpcklbw (__m64 m1, __m64 m2) /* unoptimized */ |
|
203 { |
|
204 unsigned char d[8] = |
|
205 { |
|
206 m1.m64_u8[0], |
|
207 m2.m64_u8[0], |
|
208 m1.m64_u8[1], |
|
209 m2.m64_u8[1], |
|
210 m1.m64_u8[2], |
|
211 m2.m64_u8[2], |
|
212 m1.m64_u8[3], |
|
213 m2.m64_u8[3], |
|
214 }; |
|
215 |
|
216 return M64C (d[0]); |
|
217 } |
|
218 |
|
219 __inline __m64 _m_punpckhbw (__m64 m1, __m64 m2) /* unoptimized */ |
|
220 { |
|
221 unsigned char d[8] = |
|
222 { |
|
223 m1.m64_u8[4], |
|
224 m2.m64_u8[4], |
|
225 m1.m64_u8[5], |
|
226 m2.m64_u8[5], |
|
227 m1.m64_u8[6], |
|
228 m2.m64_u8[6], |
|
229 m1.m64_u8[7], |
|
230 m2.m64_u8[7], |
|
231 }; |
|
232 |
|
233 return M64C (d[0]); |
|
234 } |
|
235 |
|
236 __inline __m64 _m_psrlwi (__m64 a, int n) /* unoptimized */ |
|
237 { |
|
238 unsigned short d[4] = |
|
239 { |
|
240 a.m64_u16[0] >> n, |
|
241 a.m64_u16[1] >> n, |
|
242 a.m64_u16[2] >> n, |
|
243 a.m64_u16[3] >> n |
|
244 }; |
|
245 |
|
246 return M64C (d[0]); |
|
247 } |
|
248 |
|
249 __inline __m64 _m_psrlqi (__m64 m, int n) |
|
250 { |
|
251 unsigned long long x = M64U (m) >> n; |
|
252 |
|
253 return M64C (x); |
|
254 } |
|
255 |
|
256 __inline __m64 _m_psllqi (__m64 m, int n) |
|
257 { |
|
258 unsigned long long x = M64U (m) << n; |
|
259 |
|
260 return M64C (x); |
|
261 } |
|
262 |
|
263 #endif /* MMX_X64_H_INCLUDED */ |