media/libvpx/vp9/common/x86/vp9_asm_stubs.c

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:55a46d1fd0c7
1 /*
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12
13 #include "./vpx_config.h"
14 #include "./vp9_rtcd.h"
15 #include "vpx_ports/mem.h"
16 ///////////////////////////////////////////////////////////////////////////
17 // the mmx function that does the bilinear filtering and var calculation //
18 // int one pass //
19 ///////////////////////////////////////////////////////////////////////////
20 DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
21 { 128, 128, 128, 128, 0, 0, 0, 0 },
22 { 120, 120, 120, 120, 8, 8, 8, 8 },
23 { 112, 112, 112, 112, 16, 16, 16, 16 },
24 { 104, 104, 104, 104, 24, 24, 24, 24 },
25 { 96, 96, 96, 96, 32, 32, 32, 32 },
26 { 88, 88, 88, 88, 40, 40, 40, 40 },
27 { 80, 80, 80, 80, 48, 48, 48, 48 },
28 { 72, 72, 72, 72, 56, 56, 56, 56 },
29 { 64, 64, 64, 64, 64, 64, 64, 64 },
30 { 56, 56, 56, 56, 72, 72, 72, 72 },
31 { 48, 48, 48, 48, 80, 80, 80, 80 },
32 { 40, 40, 40, 40, 88, 88, 88, 88 },
33 { 32, 32, 32, 32, 96, 96, 96, 96 },
34 { 24, 24, 24, 24, 104, 104, 104, 104 },
35 { 16, 16, 16, 16, 112, 112, 112, 112 },
36 { 8, 8, 8, 8, 120, 120, 120, 120 }
37 };
38
/* Prototype shared by the 1-D 8-tap filter kernels declared below:
 * reads output_height rows from src_ptr (pitch src_pitch) and writes the
 * filtered result to output_ptr (pitch out_pitch) using the 8 coefficients
 * in filter. */
typedef void filter8_1dfunction(const unsigned char *src_ptr,
                                const unsigned int src_pitch,
                                unsigned char *output_ptr,
                                unsigned int out_pitch,
                                unsigned int output_height,
                                const short *filter);
47
48 #if HAVE_SSSE3
49 filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
50 filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
51 filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
52 filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
53 filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
54 filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
55 filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
56 filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
57 filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
58 filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
59 filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
60 filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
61
/* Horizontal 8-tap convolution, SSSE3-accelerated.
 * Processes the block in vertical strips of 16, 8 and 4 columns with the
 * assembly kernels; any leftover columns (w < 4), or unsupported filter
 * configurations, fall through to the C implementation. */
void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  /* Only the unit-step case is accelerated; filter_x[3] != 128 ensures
   * the filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_h8_ssse3(src, src_stride, dst, dst_stride,
                                    h, filter_x);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_h8_ssse3(src, src_stride, dst, dst_stride,
                                   h, filter_x);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_h8_ssse3(src, src_stride, dst, dst_stride,
                                   h, filter_x);
  }
  if (w) {
    /* src/dst have been advanced past the columns already filtered. */
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
100
/* Vertical 8-tap convolution, SSSE3-accelerated.
 * Same strip decomposition as the horizontal variant; the kernels are
 * handed src backed up by three rows so the filter window covers the
 * rows preceding each output row. */
void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* Only unit steps are accelerated; filter_y[3] != 128 ensures the
   * filter can be compressed to int16_t. */
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
                                    dst, dst_stride, h, filter_y);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
  }
  if (w) {
    /* Residual columns (or unsupported filters) take the C path. */
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
138
/* Horizontal 8-tap convolution averaged into dst, SSSE3-accelerated.
 * Identical structure to vp9_convolve8_horiz_ssse3 but uses the
 * averaging kernels and C fallback. */
void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  /* Only unit steps are accelerated; filter_x[3] != 128 ensures the
   * filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, dst, dst_stride,
                                        h, filter_x);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, dst, dst_stride,
                                       h, filter_x);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, dst, dst_stride,
                                       h, filter_x);
  }
  if (w) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}
176
/* Vertical 8-tap convolution averaged into dst, SSSE3-accelerated.
 * Identical structure to vp9_convolve8_vert_ssse3 but uses the averaging
 * kernels and C fallback. */
void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  /* Only unit steps are accelerated; filter_y[3] != 128 ensures the
   * filter can be compressed to int16_t. */
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
                                        dst, dst_stride, h, filter_y);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
                                       dst, dst_stride, h, filter_y);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
                                       dst, dst_stride, h, filter_y);
  }
  if (w) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
214
215 void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
216 uint8_t *dst, ptrdiff_t dst_stride,
217 const int16_t *filter_x, int x_step_q4,
218 const int16_t *filter_y, int y_step_q4,
219 int w, int h) {
220 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
221
222 assert(w <= 64);
223 assert(h <= 64);
224 if (x_step_q4 == 16 && y_step_q4 == 16) {
225 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
226 filter_x, x_step_q4, filter_y, y_step_q4,
227 w, h + 7);
228 vp9_convolve8_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
229 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
230 } else {
231 vp9_convolve8_c(src, src_stride, dst, dst_stride,
232 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
233 }
234 }
235
236 void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
237 uint8_t *dst, ptrdiff_t dst_stride,
238 const int16_t *filter_x, int x_step_q4,
239 const int16_t *filter_y, int y_step_q4,
240 int w, int h) {
241 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
242
243 assert(w <= 64);
244 assert(h <= 64);
245 if (x_step_q4 == 16 && y_step_q4 == 16) {
246 vp9_convolve8_horiz_ssse3(src - 3 * src_stride, src_stride, fdata2, 64,
247 filter_x, x_step_q4, filter_y, y_step_q4,
248 w, h + 7);
249 vp9_convolve8_avg_vert_ssse3(fdata2 + 3 * 64, 64, dst, dst_stride,
250 filter_x, x_step_q4, filter_y, y_step_q4,
251 w, h);
252 } else {
253 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
254 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
255 }
256 }
257 #endif
258
259 #if HAVE_SSE2
260 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
261 filter8_1dfunction vp9_filter_block1d16_h8_sse2;
262 filter8_1dfunction vp9_filter_block1d8_v8_sse2;
263 filter8_1dfunction vp9_filter_block1d8_h8_sse2;
264 filter8_1dfunction vp9_filter_block1d4_v8_sse2;
265 filter8_1dfunction vp9_filter_block1d4_h8_sse2;
266 filter8_1dfunction vp9_filter_block1d16_v8_avg_sse2;
267 filter8_1dfunction vp9_filter_block1d16_h8_avg_sse2;
268 filter8_1dfunction vp9_filter_block1d8_v8_avg_sse2;
269 filter8_1dfunction vp9_filter_block1d8_h8_avg_sse2;
270 filter8_1dfunction vp9_filter_block1d4_v8_avg_sse2;
271 filter8_1dfunction vp9_filter_block1d4_h8_avg_sse2;
272
/* Horizontal 8-tap convolution, SSE2-accelerated.
 * Mirrors the SSSE3 variant: strips of 16/8/4 columns via the assembly
 * kernels, residue and unsupported filters via the C path. */
void vp9_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* Only the unit-step case is accelerated; filter_x[3] != 128 ensures
   * the filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_h8_sse2(src, src_stride, dst, dst_stride,
                                   h, filter_x);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_h8_sse2(src, src_stride, dst, dst_stride,
                                  h, filter_x);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_h8_sse2(src, src_stride, dst, dst_stride,
                                  h, filter_x);
  }
  if (w) {
    vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
311
/* Vertical 8-tap convolution, SSE2-accelerated.
 * Mirrors the SSSE3 variant; kernels receive src backed up by three rows
 * for the filter window. */
void vp9_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Only unit steps are accelerated; filter_y[3] != 128 ensures the
   * filter can be compressed to int16_t. */
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_v8_sse2(src - src_stride * 3, src_stride,
                                   dst, dst_stride, h, filter_y);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride, h, filter_y);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_v8_sse2(src - src_stride * 3, src_stride,
                                  dst, dst_stride, h, filter_y);
  }
  if (w) {
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
349
/* Horizontal 8-tap convolution averaged into dst, SSE2-accelerated. */
void vp9_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  /* Only unit steps are accelerated; filter_x[3] != 128 ensures the
   * filter can be compressed to int16_t. */
  if (x_step_q4 == 16 && filter_x[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                       h, filter_x);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                      h, filter_x);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_h8_avg_sse2(src, src_stride, dst, dst_stride,
                                      h, filter_x);
  }
  if (w) {
    vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
  }
}
387
/* Vertical 8-tap convolution averaged into dst, SSE2-accelerated. */
void vp9_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h) {
  /* Only unit steps are accelerated; filter_y[3] != 128 ensures the
   * filter can be compressed to int16_t. */
  if (y_step_q4 == 16 && filter_y[3] != 128) {
    for (; w >= 16; src += 16, dst += 16, w -= 16)
      vp9_filter_block1d16_v8_avg_sse2(src - src_stride * 3, src_stride,
                                       dst, dst_stride, h, filter_y);
    for (; w >= 8; src += 8, dst += 8, w -= 8)
      vp9_filter_block1d8_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride, h, filter_y);
    for (; w >= 4; src += 4, dst += 4, w -= 4)
      vp9_filter_block1d4_v8_avg_sse2(src - src_stride * 3, src_stride,
                                      dst, dst_stride, h, filter_y);
  }
  if (w) {
    vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4,
                             w, h);
  }
}
425
426 void vp9_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
427 uint8_t *dst, ptrdiff_t dst_stride,
428 const int16_t *filter_x, int x_step_q4,
429 const int16_t *filter_y, int y_step_q4,
430 int w, int h) {
431 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
432
433 assert(w <= 64);
434 assert(h <= 64);
435 if (x_step_q4 == 16 && y_step_q4 == 16) {
436 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
437 filter_x, x_step_q4, filter_y, y_step_q4,
438 w, h + 7);
439 vp9_convolve8_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
440 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
441 } else {
442 vp9_convolve8_c(src, src_stride, dst, dst_stride,
443 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
444 }
445 }
446
447 void vp9_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride,
448 uint8_t *dst, ptrdiff_t dst_stride,
449 const int16_t *filter_x, int x_step_q4,
450 const int16_t *filter_y, int y_step_q4,
451 int w, int h) {
452 DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 64 * 71);
453
454 assert(w <= 64);
455 assert(h <= 64);
456 if (x_step_q4 == 16 && y_step_q4 == 16) {
457 vp9_convolve8_horiz_sse2(src - 3 * src_stride, src_stride, fdata2, 64,
458 filter_x, x_step_q4, filter_y, y_step_q4,
459 w, h + 7);
460 vp9_convolve8_avg_vert_sse2(fdata2 + 3 * 64, 64, dst, dst_stride,
461 filter_x, x_step_q4, filter_y, y_step_q4,
462 w, h);
463 } else {
464 vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
465 filter_x, x_step_q4, filter_y, y_step_q4, w, h);
466 }
467 }
468 #endif

mercurial