|
/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
|
10 |
|
11 |
|
12 #include "vpx_config.h" |
|
13 #include "vp8_rtcd.h" |
|
14 #include "vpx_ports/mem.h" |
|
15 #include "filter_x86.h" |
|
16 |
|
/* Filter coefficient table: 8 rows of 6*8 shorts.  The predict functions in
 * this file select a row with xoffset (first pass) or yoffset (second pass).
 * The table is defined outside this file. */
extern const short vp8_six_tap_mmx[8][6 * 8];

/*
 * Filter kernels used by the MMX and SSE2 predict functions below.  Only the
 * declarations live here; the implementations are not visible in this file
 * (presumably hand-written assembly -- confirm against the build).
 *
 * Judging by the signatures, the "h6" kernels read 8-bit source pixels and
 * write 16-bit intermediates, the "v6" kernels read those intermediates and
 * write 8-bit output, and the "_only" kernels go straight from 8-bit source
 * to 8-bit output in a single pass.
 */

extern void vp8_filter_block1d_h6_mmx(unsigned char *src_ptr,
                                      unsigned short *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      unsigned int pixel_step,
                                      unsigned int output_height,
                                      unsigned int output_width,
                                      const short *vp8_filter);

extern void vp8_filter_block1dc_v6_mmx(unsigned short *src_ptr,
                                       unsigned char *output_ptr,
                                       int output_pitch,
                                       unsigned int pixels_per_line,
                                       unsigned int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const short *vp8_filter);

extern void vp8_filter_block1d8_h6_sse2(unsigned char *src_ptr,
                                        unsigned short *output_ptr,
                                        unsigned int src_pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);

extern void vp8_filter_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

extern void vp8_filter_block1d8_v6_sse2(unsigned short *src_ptr,
                                        unsigned char *output_ptr,
                                        int dst_pitch,
                                        unsigned int pixels_per_line,
                                        unsigned int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
                                        const short *vp8_filter);

extern void vp8_filter_block1d16_v6_sse2(unsigned short *src_ptr,
                                         unsigned char *output_ptr,
                                         int dst_pitch,
                                         unsigned int pixels_per_line,
                                         unsigned int pixel_step,
                                         unsigned int output_height,
                                         unsigned int output_width,
                                         const short *vp8_filter);

/* Takes no filter argument; used below to fill the 16-bit scratch buffer
 * before a second-pass-only 16x16 prediction. */
extern void vp8_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
                                         unsigned short *output_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         unsigned int output_width);

extern void vp8_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);

extern void vp8_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
                                              unsigned int src_pixels_per_line,
                                              unsigned char *output_ptr,
                                              int dst_pitch,
                                              unsigned int output_height,
                                              const short *vp8_filter);

extern void vp8_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned char *output_ptr,
                                             int dst_pitch,
                                             unsigned int output_height,
                                             const short *vp8_filter);
|
117 |
|
118 |
|
119 #if HAVE_MMX |
|
120 void vp8_sixtap_predict4x4_mmx |
|
121 ( |
|
122 unsigned char *src_ptr, |
|
123 int src_pixels_per_line, |
|
124 int xoffset, |
|
125 int yoffset, |
|
126 unsigned char *dst_ptr, |
|
127 int dst_pitch |
|
128 ) |
|
129 { |
|
130 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 16*16); /* Temp data bufffer used in filtering */ |
|
131 const short *HFilter, *VFilter; |
|
132 HFilter = vp8_six_tap_mmx[xoffset]; |
|
133 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 8, HFilter); |
|
134 VFilter = vp8_six_tap_mmx[yoffset]; |
|
135 vp8_filter_block1dc_v6_mmx(FData2 + 8, dst_ptr, dst_pitch, 8, 4 , 4, 4, VFilter); |
|
136 |
|
137 } |
|
138 |
|
139 |
|
140 void vp8_sixtap_predict16x16_mmx |
|
141 ( |
|
142 unsigned char *src_ptr, |
|
143 int src_pixels_per_line, |
|
144 int xoffset, |
|
145 int yoffset, |
|
146 unsigned char *dst_ptr, |
|
147 int dst_pitch |
|
148 ) |
|
149 { |
|
150 |
|
151 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ |
|
152 |
|
153 const short *HFilter, *VFilter; |
|
154 |
|
155 |
|
156 HFilter = vp8_six_tap_mmx[xoffset]; |
|
157 |
|
158 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); |
|
159 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 21, 32, HFilter); |
|
160 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8, FData2 + 8, src_pixels_per_line, 1, 21, 32, HFilter); |
|
161 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12, FData2 + 12, src_pixels_per_line, 1, 21, 32, HFilter); |
|
162 |
|
163 VFilter = vp8_six_tap_mmx[yoffset]; |
|
164 vp8_filter_block1dc_v6_mmx(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, 16, VFilter); |
|
165 vp8_filter_block1dc_v6_mmx(FData2 + 36, dst_ptr + 4, dst_pitch, 32, 16 , 16, 16, VFilter); |
|
166 vp8_filter_block1dc_v6_mmx(FData2 + 40, dst_ptr + 8, dst_pitch, 32, 16 , 16, 16, VFilter); |
|
167 vp8_filter_block1dc_v6_mmx(FData2 + 44, dst_ptr + 12, dst_pitch, 32, 16 , 16, 16, VFilter); |
|
168 |
|
169 } |
|
170 |
|
171 |
|
172 void vp8_sixtap_predict8x8_mmx |
|
173 ( |
|
174 unsigned char *src_ptr, |
|
175 int src_pixels_per_line, |
|
176 int xoffset, |
|
177 int yoffset, |
|
178 unsigned char *dst_ptr, |
|
179 int dst_pitch |
|
180 ) |
|
181 { |
|
182 |
|
183 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ |
|
184 |
|
185 const short *HFilter, *VFilter; |
|
186 |
|
187 HFilter = vp8_six_tap_mmx[xoffset]; |
|
188 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); |
|
189 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 13, 16, HFilter); |
|
190 |
|
191 VFilter = vp8_six_tap_mmx[yoffset]; |
|
192 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, 8, VFilter); |
|
193 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 8, 8, VFilter); |
|
194 |
|
195 } |
|
196 |
|
197 |
|
198 void vp8_sixtap_predict8x4_mmx |
|
199 ( |
|
200 unsigned char *src_ptr, |
|
201 int src_pixels_per_line, |
|
202 int xoffset, |
|
203 int yoffset, |
|
204 unsigned char *dst_ptr, |
|
205 int dst_pitch |
|
206 ) |
|
207 { |
|
208 |
|
209 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ |
|
210 |
|
211 const short *HFilter, *VFilter; |
|
212 |
|
213 HFilter = vp8_six_tap_mmx[xoffset]; |
|
214 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); |
|
215 vp8_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4, FData2 + 4, src_pixels_per_line, 1, 9, 16, HFilter); |
|
216 |
|
217 VFilter = vp8_six_tap_mmx[yoffset]; |
|
218 vp8_filter_block1dc_v6_mmx(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, 8, VFilter); |
|
219 vp8_filter_block1dc_v6_mmx(FData2 + 20, dst_ptr + 4, dst_pitch, 16, 8 , 4, 8, VFilter); |
|
220 |
|
221 } |
|
222 |
|
223 |
|
224 |
|
/*
 * 16x16 bilinear prediction built from four 8x8 calls.
 *
 * The quadrants are visited in the order top-left, top-right, bottom-left,
 * bottom-right; each call receives the same offsets and pitches with the
 * source and destination pointers moved to the quadrant's origin.
 */
void vp8_bilinear_predict16x16_mmx(unsigned char *src_ptr,
                                   int src_pixels_per_line,
                                   int xoffset,
                                   int yoffset,
                                   unsigned char *dst_ptr,
                                   int dst_pitch)
{
    int row, col;

    for (row = 0; row < 16; row += 8) {
        for (col = 0; col < 16; col += 8) {
            vp8_bilinear_predict8x8_mmx(
                src_ptr + row * src_pixels_per_line + col,
                src_pixels_per_line, xoffset, yoffset,
                dst_ptr + dst_pitch * row + col, dst_pitch);
        }
    }
}
|
240 #endif |
|
241 |
|
242 |
|
243 #if HAVE_SSE2 |
|
244 void vp8_sixtap_predict16x16_sse2 |
|
245 ( |
|
246 unsigned char *src_ptr, |
|
247 int src_pixels_per_line, |
|
248 int xoffset, |
|
249 int yoffset, |
|
250 unsigned char *dst_ptr, |
|
251 int dst_pitch |
|
252 |
|
253 ) |
|
254 { |
|
255 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 24*24); /* Temp data bufffer used in filtering */ |
|
256 |
|
257 const short *HFilter, *VFilter; |
|
258 |
|
259 if (xoffset) |
|
260 { |
|
261 if (yoffset) |
|
262 { |
|
263 HFilter = vp8_six_tap_mmx[xoffset]; |
|
264 vp8_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 21, 32, HFilter); |
|
265 VFilter = vp8_six_tap_mmx[yoffset]; |
|
266 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); |
|
267 } |
|
268 else |
|
269 { |
|
270 /* First-pass only */ |
|
271 HFilter = vp8_six_tap_mmx[xoffset]; |
|
272 vp8_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 16, HFilter); |
|
273 } |
|
274 } |
|
275 else |
|
276 { |
|
277 /* Second-pass only */ |
|
278 VFilter = vp8_six_tap_mmx[yoffset]; |
|
279 vp8_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 21, 32); |
|
280 vp8_filter_block1d16_v6_sse2(FData2 + 32, dst_ptr, dst_pitch, 32, 16 , 16, dst_pitch, VFilter); |
|
281 } |
|
282 } |
|
283 |
|
284 |
|
285 void vp8_sixtap_predict8x8_sse2 |
|
286 ( |
|
287 unsigned char *src_ptr, |
|
288 int src_pixels_per_line, |
|
289 int xoffset, |
|
290 int yoffset, |
|
291 unsigned char *dst_ptr, |
|
292 int dst_pitch |
|
293 ) |
|
294 { |
|
295 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ |
|
296 const short *HFilter, *VFilter; |
|
297 |
|
298 if (xoffset) |
|
299 { |
|
300 if (yoffset) |
|
301 { |
|
302 HFilter = vp8_six_tap_mmx[xoffset]; |
|
303 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 13, 16, HFilter); |
|
304 VFilter = vp8_six_tap_mmx[yoffset]; |
|
305 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 8, dst_pitch, VFilter); |
|
306 } |
|
307 else |
|
308 { |
|
309 /* First-pass only */ |
|
310 HFilter = vp8_six_tap_mmx[xoffset]; |
|
311 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 8, HFilter); |
|
312 } |
|
313 } |
|
314 else |
|
315 { |
|
316 /* Second-pass only */ |
|
317 VFilter = vp8_six_tap_mmx[yoffset]; |
|
318 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 8, VFilter); |
|
319 } |
|
320 } |
|
321 |
|
322 |
|
323 void vp8_sixtap_predict8x4_sse2 |
|
324 ( |
|
325 unsigned char *src_ptr, |
|
326 int src_pixels_per_line, |
|
327 int xoffset, |
|
328 int yoffset, |
|
329 unsigned char *dst_ptr, |
|
330 int dst_pitch |
|
331 ) |
|
332 { |
|
333 DECLARE_ALIGNED_ARRAY(16, unsigned short, FData2, 256); /* Temp data bufffer used in filtering */ |
|
334 const short *HFilter, *VFilter; |
|
335 |
|
336 if (xoffset) |
|
337 { |
|
338 if (yoffset) |
|
339 { |
|
340 HFilter = vp8_six_tap_mmx[xoffset]; |
|
341 vp8_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), FData2, src_pixels_per_line, 1, 9, 16, HFilter); |
|
342 VFilter = vp8_six_tap_mmx[yoffset]; |
|
343 vp8_filter_block1d8_v6_sse2(FData2 + 16, dst_ptr, dst_pitch, 16, 8 , 4, dst_pitch, VFilter); |
|
344 } |
|
345 else |
|
346 { |
|
347 /* First-pass only */ |
|
348 HFilter = vp8_six_tap_mmx[xoffset]; |
|
349 vp8_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch, 4, HFilter); |
|
350 } |
|
351 } |
|
352 else |
|
353 { |
|
354 /* Second-pass only */ |
|
355 VFilter = vp8_six_tap_mmx[yoffset]; |
|
356 vp8_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line), src_pixels_per_line, dst_ptr, dst_pitch, 4, VFilter); |
|
357 } |
|
358 } |
|
359 |
|
360 #endif |
|
361 |
|
362 #if HAVE_SSSE3 |
|
363 |
|
/*
 * SSSE3 filter kernels used by the SSSE3 predict functions below.  Only the
 * declarations live here; the implementations are not visible in this file.
 *
 * Unlike the MMX/SSE2 kernels, these take a filter index instead of a
 * coefficient pointer, and both the h6 and v6 kernels read and write 8-bit
 * data.
 */

extern void vp8_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

extern void vp8_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          unsigned int vp8_filter_index);

extern void vp8_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);

extern void vp8_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pitch,
                                         unsigned char *output_ptr,
                                         unsigned int out_pitch,
                                         unsigned int output_height,
                                         unsigned int vp8_filter_index);
|
423 |
|
424 void vp8_sixtap_predict16x16_ssse3 |
|
425 ( |
|
426 unsigned char *src_ptr, |
|
427 int src_pixels_per_line, |
|
428 int xoffset, |
|
429 int yoffset, |
|
430 unsigned char *dst_ptr, |
|
431 int dst_pitch |
|
432 |
|
433 ) |
|
434 { |
|
435 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 24*24); |
|
436 |
|
437 if (xoffset) |
|
438 { |
|
439 if (yoffset) |
|
440 { |
|
441 vp8_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
442 src_pixels_per_line, FData2, |
|
443 16, 21, xoffset); |
|
444 vp8_filter_block1d16_v6_ssse3(FData2 , 16, dst_ptr, dst_pitch, |
|
445 16, yoffset); |
|
446 } |
|
447 else |
|
448 { |
|
449 /* First-pass only */ |
|
450 vp8_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line, |
|
451 dst_ptr, dst_pitch, 16, xoffset); |
|
452 } |
|
453 } |
|
454 else |
|
455 { |
|
456 if (yoffset) |
|
457 { |
|
458 /* Second-pass only */ |
|
459 vp8_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
460 src_pixels_per_line, |
|
461 dst_ptr, dst_pitch, 16, yoffset); |
|
462 } |
|
463 else |
|
464 { |
|
465 /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
|
466 * yoffset==0) case correctly. Add copy function here to guarantee |
|
467 * six-tap function handles all possible offsets. */ |
|
468 vp8_copy_mem16x16(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); |
|
469 } |
|
470 } |
|
471 } |
|
472 |
|
473 void vp8_sixtap_predict8x8_ssse3 |
|
474 ( |
|
475 unsigned char *src_ptr, |
|
476 int src_pixels_per_line, |
|
477 int xoffset, |
|
478 int yoffset, |
|
479 unsigned char *dst_ptr, |
|
480 int dst_pitch |
|
481 ) |
|
482 { |
|
483 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); |
|
484 |
|
485 if (xoffset) |
|
486 { |
|
487 if (yoffset) |
|
488 { |
|
489 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
490 src_pixels_per_line, FData2, |
|
491 8, 13, xoffset); |
|
492 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, |
|
493 8, yoffset); |
|
494 } |
|
495 else |
|
496 { |
|
497 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, |
|
498 dst_ptr, dst_pitch, 8, xoffset); |
|
499 } |
|
500 } |
|
501 else |
|
502 { |
|
503 if (yoffset) |
|
504 { |
|
505 /* Second-pass only */ |
|
506 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
507 src_pixels_per_line, |
|
508 dst_ptr, dst_pitch, 8, yoffset); |
|
509 } |
|
510 else |
|
511 { |
|
512 /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
|
513 * yoffset==0) case correctly. Add copy function here to guarantee |
|
514 * six-tap function handles all possible offsets. */ |
|
515 vp8_copy_mem8x8(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); |
|
516 } |
|
517 } |
|
518 } |
|
519 |
|
520 |
|
521 void vp8_sixtap_predict8x4_ssse3 |
|
522 ( |
|
523 unsigned char *src_ptr, |
|
524 int src_pixels_per_line, |
|
525 int xoffset, |
|
526 int yoffset, |
|
527 unsigned char *dst_ptr, |
|
528 int dst_pitch |
|
529 ) |
|
530 { |
|
531 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 256); |
|
532 |
|
533 if (xoffset) |
|
534 { |
|
535 if (yoffset) |
|
536 { |
|
537 vp8_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
538 src_pixels_per_line, FData2, |
|
539 8, 9, xoffset); |
|
540 vp8_filter_block1d8_v6_ssse3(FData2, 8, dst_ptr, dst_pitch, |
|
541 4, yoffset); |
|
542 } |
|
543 else |
|
544 { |
|
545 /* First-pass only */ |
|
546 vp8_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line, |
|
547 dst_ptr, dst_pitch, 4, xoffset); |
|
548 } |
|
549 } |
|
550 else |
|
551 { |
|
552 if (yoffset) |
|
553 { |
|
554 /* Second-pass only */ |
|
555 vp8_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
556 src_pixels_per_line, |
|
557 dst_ptr, dst_pitch, 4, yoffset); |
|
558 } |
|
559 else |
|
560 { |
|
561 /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
|
562 * yoffset==0) case correctly. Add copy function here to guarantee |
|
563 * six-tap function handles all possible offsets. */ |
|
564 vp8_copy_mem8x4(src_ptr, src_pixels_per_line, dst_ptr, dst_pitch); |
|
565 } |
|
566 } |
|
567 } |
|
568 |
|
569 void vp8_sixtap_predict4x4_ssse3 |
|
570 ( |
|
571 unsigned char *src_ptr, |
|
572 int src_pixels_per_line, |
|
573 int xoffset, |
|
574 int yoffset, |
|
575 unsigned char *dst_ptr, |
|
576 int dst_pitch |
|
577 ) |
|
578 { |
|
579 DECLARE_ALIGNED_ARRAY(16, unsigned char, FData2, 4*9); |
|
580 |
|
581 if (xoffset) |
|
582 { |
|
583 if (yoffset) |
|
584 { |
|
585 vp8_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
586 src_pixels_per_line, |
|
587 FData2, 4, 9, xoffset); |
|
588 vp8_filter_block1d4_v6_ssse3(FData2, 4, dst_ptr, dst_pitch, |
|
589 4, yoffset); |
|
590 } |
|
591 else |
|
592 { |
|
593 vp8_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line, |
|
594 dst_ptr, dst_pitch, 4, xoffset); |
|
595 } |
|
596 } |
|
597 else |
|
598 { |
|
599 if (yoffset) |
|
600 { |
|
601 vp8_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line), |
|
602 src_pixels_per_line, |
|
603 dst_ptr, dst_pitch, 4, yoffset); |
|
604 } |
|
605 else |
|
606 { |
|
607 /* ssse3 second-pass only function couldn't handle (xoffset==0 && |
|
608 * yoffset==0) case correctly. Add copy function here to guarantee |
|
609 * six-tap function handles all possible offsets. */ |
|
610 int r; |
|
611 |
|
612 for (r = 0; r < 4; r++) |
|
613 { |
|
614 dst_ptr[0] = src_ptr[0]; |
|
615 dst_ptr[1] = src_ptr[1]; |
|
616 dst_ptr[2] = src_ptr[2]; |
|
617 dst_ptr[3] = src_ptr[3]; |
|
618 dst_ptr += dst_pitch; |
|
619 src_ptr += src_pixels_per_line; |
|
620 } |
|
621 } |
|
622 } |
|
623 } |
|
624 |
|
625 #endif |