|
/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

|
#include <assert.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
|
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// in one pass                                                           //
///////////////////////////////////////////////////////////////////////////
|
20 DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = { |
|
21 { 128, 128, 128, 128, 0, 0, 0, 0 }, |
|
22 { 120, 120, 120, 120, 8, 8, 8, 8 }, |
|
23 { 112, 112, 112, 112, 16, 16, 16, 16 }, |
|
24 { 104, 104, 104, 104, 24, 24, 24, 24 }, |
|
25 { 96, 96, 96, 96, 32, 32, 32, 32 }, |
|
26 { 88, 88, 88, 88, 40, 40, 40, 40 }, |
|
27 { 80, 80, 80, 80, 48, 48, 48, 48 }, |
|
28 { 72, 72, 72, 72, 56, 56, 56, 56 }, |
|
29 { 64, 64, 64, 64, 64, 64, 64, 64 }, |
|
30 { 56, 56, 56, 56, 72, 72, 72, 72 }, |
|
31 { 48, 48, 48, 48, 80, 80, 80, 80 }, |
|
32 { 40, 40, 40, 40, 88, 88, 88, 88 }, |
|
33 { 32, 32, 32, 32, 96, 96, 96, 96 }, |
|
34 { 24, 24, 24, 24, 104, 104, 104, 104 }, |
|
35 { 16, 16, 16, 16, 112, 112, 112, 112 }, |
|
36 { 8, 8, 8, 8, 120, 120, 120, 120 } |
|
37 }; |
|
38 |
|
39 typedef void filter8_1dfunction ( |
|
40 const unsigned char *src_ptr, |
|
41 const unsigned int src_pitch, |
|
42 unsigned char *output_ptr, |
|
43 unsigned int out_pitch, |
|
44 unsigned int output_height, |
|
45 const short *filter |
|
46 ); |
|
47 |
|
48 #if HAVE_SSSE3 |
|
49 filter8_1dfunction vp9_filter_block1d16_v8_ssse3; |
|
50 filter8_1dfunction vp9_filter_block1d16_h8_ssse3; |
|
51 filter8_1dfunction vp9_filter_block1d8_v8_ssse3; |
|
52 filter8_1dfunction vp9_filter_block1d8_h8_ssse3; |
|
53 filter8_1dfunction vp9_filter_block1d4_v8_ssse3; |
|
54 filter8_1dfunction vp9_filter_block1d4_h8_ssse3; |
|
55 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; |
|
56 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; |
|
57 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; |
|
58 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; |
|
59 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; |
|
60 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; |
|
61 |
|
62 void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
|
63 uint8_t *dst, ptrdiff_t dst_stride, |
|
64 const int16_t *filter_x, int x_step_q4, |
|
65 const int16_t *filter_y, int y_step_q4, |
|
66 int w, int h) { |
|
67 /* Ensure the filter can be compressed to int16_t. */ |
|
68 if (x_step_q4 == 16 && filter_x[3] != 128) { |
|
69 while (w >= 16) { |
|
70 vp9_filter_block1d16_h8_ssse3(src, src_stride, |
|
71 dst, dst_stride, |
|
72 h, filter_x); |
|
73 src += 16; |
|
74 dst += 16; |
|
75 w -= 16; |
|
76 } |
|
77 while (w >= 8) { |
|
78 vp9_filter_block1d8_h8_ssse3(src, src_stride, |
|
79 dst, dst_stride, |
|
80 h, filter_x); |
|
81 src += 8; |
|
82 dst += 8; |
|
83 w -= 8; |
|
84 } |
|
85 while (w >= 4) { |
|
86 vp9_filter_block1d4_h8_ssse3(src, src_stride, |
|
87 dst, dst_stride, |
|
88 h, filter_x); |
|
89 src += 4; |
|
90 dst += 4; |
|
91 w -= 4; |
|
92 } |
|
93 } |
|
94 if (w) { |
|
95 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, |
|
96 filter_x, x_step_q4, filter_y, y_step_q4, |
|
97 w, h); |
|
98 } |
|
99 } |
|
100 |
|
101 void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
|
102 uint8_t *dst, ptrdiff_t dst_stride, |
|
103 const int16_t *filter_x, int x_step_q4, |
|
104 const int16_t *filter_y, int y_step_q4, |
|
105 int w, int h) { |
|
106 if (y_step_q4 == 16 && filter_y[3] != 128) { |
|
107 while (w >= 16) { |
|
108 vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride, |
|
109 dst, dst_stride, |
|
110 h, filter_y); |
|
111 src += 16; |
|
112 dst += 16; |
|
113 w -= 16; |
|
114 } |
|
115 while (w >= 8) { |
|
116 vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride, |
|
117 dst, dst_stride, |
|
118 h, filter_y); |
|
119 src += 8; |
|
120 dst += 8; |
|
121 w -= 8; |
|
122 } |
|
123 while (w >= 4) { |
|
124 vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride, |
|
125 dst, dst_stride, |
|
126 h, filter_y); |
|
127 src += 4; |
|
128 dst += 4; |
|
129 w -= 4; |
|
130 } |
|
131 } |
|
132 if (w) { |
|
133 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, |
|
134 filter_x, x_step_q4, filter_y, y_step_q4, |
|
135 w, h); |
|
136 } |
|
137 } |
|
138 |
|
139 void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
|
140 uint8_t *dst, ptrdiff_t dst_stride, |
|
141 const int16_t *filter_x, int x_step_q4, |
|
142 const int16_t *filter_y, int y_step_q4, |
|
143 int w, int h) { |
|
144 if (x_step_q4 == 16 && filter_x[3] != 128) { |
|
145 while (w >= 16) { |
|
146 vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, |
|
147 dst, dst_stride, |
|
148 h, filter_x); |
|
149 src += 16; |
|
150 dst += 16; |
|
151 w -= 16; |
|
152 } |
|
153 while (w >= 8) { |
|
154 vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, |
|
155 dst, dst_stride, |
|
156 h, filter_x); |
|
157 src += 8; |
|
158 dst += 8; |
|
159 w -= 8; |
|
160 } |
|
161 while (w >= 4) { |
|
162 vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, |
|
163 dst, dst_stride, |
|
164 h, filter_x); |
|
165 src += 4; |
|
166 dst += 4; |
|
167 w -= 4; |
|
168 } |
|
169 } |
|
170 if (w) { |
|
171 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |
|
172 filter_x, x_step_q4, filter_y, y_step_q4, |
|
173 w, h); |
|
174 } |
|
175 } |
|
176 |
|
177 void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
|
178 uint8_t *dst, ptrdiff_t dst_stride, |
|
179 const int16_t *filter_x, int x_step_q4, |
|
180 const int16_t *filter_y, int y_step_q4, |
|
181 int w, int h) { |
|
182 if (y_step_q4 == 16 && filter_y[3] != 128) { |
|
183 while (w >= 16) { |
|
184 vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, |
|
185 dst, dst_stride, |
|
186 h, filter_y); |
|
187 src += 16; |
|
188 dst += 16; |
|
189 w -= 16; |
|
190 } |
|
191 while (w >= 8) { |
|
192 vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, |
|
193 dst, dst_stride, |
|
194 h, filter_y); |
|
195 src += 8; |
|
196 dst += 8; |
|
197 w -= 8; |
|
198 } |
|
199 while (w >= 4) { |
|
200 vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, |
|
201 dst, dst_stride, |
|
202 h, filter_y); |
|
203 src += 4; |
|
204 dst += 4; |
|
205 w -= 4; |
|
206 } |
|
207 } |
|
208 if (w) { |
|
209 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |
|
210 filter_x, x_step_q4, filter_y, y_step_q4, |
|
211 w, h); |
|
212 } |
|
213 } |
|
214 |
|
215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
|
216 uint8_t *dst, ptrdiff_t dst_stride, |
|
217 const int16_t *filter_x, int x_step_q4, |
|
218 const int16_t *filter_y, int y_step_q4, |
|
219 int w, int h) { |
|
220 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
|
221 |
|
222 assert(w <= 64); |
|
223 assert(h <= 64); |
|
224 if (x_step_q4 == 16 && y_step_q4 == 16) { |
|
225 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, |
|
226 filter_x, x_step_q4, filter_y, y_step_q4, |
|
227 w, h + 7); |
|
228 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
|
229 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
|
230 } else { |
|
231 vp9_convolve8_c(src, src_stride, dst, dst_stride, |
|
232 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
|
233 } |
|
234 } |
|
235 |
|
236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, |
|
237 uint8_t *dst, ptrdiff_t dst_stride, |
|
238 const int16_t *filter_x, int x_step_q4, |
|
239 const int16_t *filter_y, int y_step_q4, |
|
240 int w, int h) { |
|
241 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
|
242 |
|
243 assert(w <= 64); |
|
244 assert(h <= 64); |
|
245 if (x_step_q4 == 16 && y_step_q4 == 16) { |
|
246 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64, |
|
247 filter_x, x_step_q4, filter_y, y_step_q4, |
|
248 w, h + 7); |
|
249 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride, |
|
250 filter_x, x_step_q4, filter_y, y_step_q4, |
|
251 w, h); |
|
252 } else { |
|
253 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
|
254 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
|
255 } |
|
256 } |
|
257 #endif |
|
258 |
|
259 #if HAVE_SSE2 |
|
260 filter8_1dfunction vp9_filter_block1d16_v8_sse2; |
|
261 filter8_1dfunction vp9_filter_block1d16_h8_sse2; |
|
262 filter8_1dfunction vp9_filter_block1d8_v8_sse2; |
|
263 filter8_1dfunction vp9_filter_block1d8_h8_sse2; |
|
264 filter8_1dfunction vp9_filter_block1d4_v8_sse2; |
|
265 filter8_1dfunction vp9_filter_block1d4_h8_sse2; |
|
266 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2; |
|
267 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2; |
|
268 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2; |
|
269 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2; |
|
270 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2; |
|
271 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2; |
|
272 |
|
273 void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, |
|
274 uint8_t *dst, ptrdiff_t dst_stride, |
|
275 const int16_t *filter_x, int x_step_q4, |
|
276 const int16_t *filter_y, int y_step_q4, |
|
277 int w, int h) { |
|
278 /* Ensure the filter can be compressed to int16_t. */ |
|
279 if (x_step_q4 == 16 && filter_x[3] != 128) { |
|
280 while (w >= 16) { |
|
281 vp9_filter_block1d16_h8_sse2(src, src_stride, |
|
282 dst, dst_stride, |
|
283 h, filter_x); |
|
284 src += 16; |
|
285 dst += 16; |
|
286 w -= 16; |
|
287 } |
|
288 while (w >= 8) { |
|
289 vp9_filter_block1d8_h8_sse2(src, src_stride, |
|
290 dst, dst_stride, |
|
291 h, filter_x); |
|
292 src += 8; |
|
293 dst += 8; |
|
294 w -= 8; |
|
295 } |
|
296 while (w >= 4) { |
|
297 vp9_filter_block1d4_h8_sse2(src, src_stride, |
|
298 dst, dst_stride, |
|
299 h, filter_x); |
|
300 src += 4; |
|
301 dst += 4; |
|
302 w -= 4; |
|
303 } |
|
304 } |
|
305 if (w) { |
|
306 vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, |
|
307 filter_x, x_step_q4, filter_y, y_step_q4, |
|
308 w, h); |
|
309 } |
|
310 } |
|
311 |
|
312 void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, |
|
313 uint8_t *dst, ptrdiff_t dst_stride, |
|
314 const int16_t *filter_x, int x_step_q4, |
|
315 const int16_t *filter_y, int y_step_q4, |
|
316 int w, int h) { |
|
317 if (y_step_q4 == 16 && filter_y[3] != 128) { |
|
318 while (w >= 16) { |
|
319 vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride, |
|
320 dst, dst_stride, |
|
321 h, filter_y); |
|
322 src += 16; |
|
323 dst += 16; |
|
324 w -= 16; |
|
325 } |
|
326 while (w >= 8) { |
|
327 vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride, |
|
328 dst, dst_stride, |
|
329 h, filter_y); |
|
330 src += 8; |
|
331 dst += 8; |
|
332 w -= 8; |
|
333 } |
|
334 while (w >= 4) { |
|
335 vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride, |
|
336 dst, dst_stride, |
|
337 h, filter_y); |
|
338 src += 4; |
|
339 dst += 4; |
|
340 w -= 4; |
|
341 } |
|
342 } |
|
343 if (w) { |
|
344 vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, |
|
345 filter_x, x_step_q4, filter_y, y_step_q4, |
|
346 w, h); |
|
347 } |
|
348 } |
|
349 |
|
350 void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, |
|
351 uint8_t *dst, ptrdiff_t dst_stride, |
|
352 const int16_t *filter_x, int x_step_q4, |
|
353 const int16_t *filter_y, int y_step_q4, |
|
354 int w, int h) { |
|
355 if (x_step_q4 == 16 && filter_x[3] != 128) { |
|
356 while (w >= 16) { |
|
357 vp9_filter_block1d16_h8_avg_sse2(src, src_stride, |
|
358 dst, dst_stride, |
|
359 h, filter_x); |
|
360 src += 16; |
|
361 dst += 16; |
|
362 w -= 16; |
|
363 } |
|
364 while (w >= 8) { |
|
365 vp9_filter_block1d8_h8_avg_sse2(src, src_stride, |
|
366 dst, dst_stride, |
|
367 h, filter_x); |
|
368 src += 8; |
|
369 dst += 8; |
|
370 w -= 8; |
|
371 } |
|
372 while (w >= 4) { |
|
373 vp9_filter_block1d4_h8_avg_sse2(src, src_stride, |
|
374 dst, dst_stride, |
|
375 h, filter_x); |
|
376 src += 4; |
|
377 dst += 4; |
|
378 w -= 4; |
|
379 } |
|
380 } |
|
381 if (w) { |
|
382 vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, |
|
383 filter_x, x_step_q4, filter_y, y_step_q4, |
|
384 w, h); |
|
385 } |
|
386 } |
|
387 |
|
388 void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, |
|
389 uint8_t *dst, ptrdiff_t dst_stride, |
|
390 const int16_t *filter_x, int x_step_q4, |
|
391 const int16_t *filter_y, int y_step_q4, |
|
392 int w, int h) { |
|
393 if (y_step_q4 == 16 && filter_y[3] != 128) { |
|
394 while (w >= 16) { |
|
395 vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride, |
|
396 dst, dst_stride, |
|
397 h, filter_y); |
|
398 src += 16; |
|
399 dst += 16; |
|
400 w -= 16; |
|
401 } |
|
402 while (w >= 8) { |
|
403 vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride, |
|
404 dst, dst_stride, |
|
405 h, filter_y); |
|
406 src += 8; |
|
407 dst += 8; |
|
408 w -= 8; |
|
409 } |
|
410 while (w >= 4) { |
|
411 vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride, |
|
412 dst, dst_stride, |
|
413 h, filter_y); |
|
414 src += 4; |
|
415 dst += 4; |
|
416 w -= 4; |
|
417 } |
|
418 } |
|
419 if (w) { |
|
420 vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, |
|
421 filter_x, x_step_q4, filter_y, y_step_q4, |
|
422 w, h); |
|
423 } |
|
424 } |
|
425 |
|
426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, |
|
427 uint8_t *dst, ptrdiff_t dst_stride, |
|
428 const int16_t *filter_x, int x_step_q4, |
|
429 const int16_t *filter_y, int y_step_q4, |
|
430 int w, int h) { |
|
431 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
|
432 |
|
433 assert(w <= 64); |
|
434 assert(h <= 64); |
|
435 if (x_step_q4 == 16 && y_step_q4 == 16) { |
|
436 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, |
|
437 filter_x, x_step_q4, filter_y, y_step_q4, |
|
438 w, h + 7); |
|
439 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, |
|
440 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
|
441 } else { |
|
442 vp9_convolve8_c(src, src_stride, dst, dst_stride, |
|
443 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
|
444 } |
|
445 } |
|
446 |
|
447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, |
|
448 uint8_t *dst, ptrdiff_t dst_stride, |
|
449 const int16_t *filter_x, int x_step_q4, |
|
450 const int16_t *filter_y, int y_step_q4, |
|
451 int w, int h) { |
|
452 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71); |
|
453 |
|
454 assert(w <= 64); |
|
455 assert(h <= 64); |
|
456 if (x_step_q4 == 16 && y_step_q4 == 16) { |
|
457 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64, |
|
458 filter_x, x_step_q4, filter_y, y_step_q4, |
|
459 w, h + 7); |
|
460 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride, |
|
461 filter_x, x_step_q4, filter_y, y_step_q4, |
|
462 w, h); |
|
463 } else { |
|
464 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, |
|
465 filter_x, x_step_q4, filter_y, y_step_q4, w, h); |
|
466 } |
|
467 } |
|
468 #endif |