|
1 /* |
|
2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 * |
|
4 * Use of this source code is governed by a BSD-style license |
|
5 * that can be found in the LICENSE file in the root of the source |
|
6 * tree. An additional intellectual property rights grant can be found |
|
7 * in the file PATENTS. All contributing project authors may |
|
8 * be found in the AUTHORS file in the root of the source tree. |
|
9 */ |
|
10 |
|
11 #include "vpx_config.h" |
|
12 #include "vp8/common/variance.h" |
|
13 #include "vp8/common/pragmas.h" |
|
14 #include "vpx_ports/mem.h" |
|
15 #include "vp8/common/x86/filter_x86.h" |
|
16 |
|
/* Filter routines defined outside this file.
 * NOTE(review): nothing in the visible part of this file calls them --
 * confirm whether these declarations are still needed here.
 */
extern void filter_block1d_h6_mmx(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);

extern void filter_block1d_v6_mmx(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);

extern void filter_block1d8_h6_sse2(
    const unsigned char *src_ptr,
    unsigned short *output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);

extern void filter_block1d8_v6_sse2(
    const short *src_ptr,
    unsigned char *output_ptr,
    unsigned int pixels_per_line,
    unsigned int pixel_step,
    unsigned int output_height,
    unsigned int output_width,
    short *filter);
|
21 |
|
/* MMX helpers defined outside this file; used by the 4x4 wrappers below. */

/* Takes two filter-coefficient pointers and writes its results through
 * `sum` and `sumsquared`.
 */
extern void vp8_filter_block2d_bil4x4_var_mmx(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    const short *HFilter,
    const short *VFilter,
    int *sum,
    unsigned int *sumsquared);

/* Writes its results through `SSE` and `Sum`. */
extern unsigned int vp8_get4x4var_mmx(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum);
|
43 |
|
/* SSE2 helpers defined outside this file. */

unsigned int vp8_get_mb_ss_sse2(const short *src_ptr);

/* Both writers below report their results through `SSE` and `Sum`. */
unsigned int vp8_get16x16var_sse2(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum);

unsigned int vp8_get8x8var_sse2(
    const unsigned char *src_ptr,
    int source_stride,
    const unsigned char *ref_ptr,
    int recon_stride,
    unsigned int *SSE,
    int *Sum);

/* General sub-pixel path: takes explicit x/y offsets and a row count, and
 * reports through `sum` / `sumsquared`.
 */
void vp8_filter_block2d_bil_var_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int xoffset,
    int yoffset,
    int *sum,
    unsigned int *sumsquared);

/* The half_* routines share one signature: two pointer/stride pairs, a row
 * count, and the `sum` / `sumsquared` outputs.
 */
void vp8_half_horiz_vert_variance8x_h_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared);

void vp8_half_horiz_vert_variance16x_h_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared);

void vp8_half_horiz_variance8x_h_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared);

void vp8_half_horiz_variance16x_h_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared);

void vp8_half_vert_variance8x_h_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared);

void vp8_half_vert_variance16x_h_sse2(
    const unsigned char *ref_ptr,
    int ref_pixels_per_line,
    const unsigned char *src_ptr,
    int src_pixels_per_line,
    unsigned int Height,
    int *sum,
    unsigned int *sumsquared);
|
138 |
|
/* 4x4 variance wrapper.
 *
 * Runs vp8_get4x4var_mmx() on the two blocks, stores the helper's SSE
 * output in *sse, and returns that value minus (Sum * Sum) >> 4, where Sum
 * is the helper's second output.
 */
unsigned int vp8_variance4x4_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse) {
    unsigned int sq_err;
    unsigned int sum_squared;
    int diff_sum;

    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride,
                      &sq_err, &diff_sum);

    /* The cast makes the multiplication unsigned, so the square is formed
     * modulo 2^32 instead of in signed int.
     */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 4);
}
|
154 |
|
/* 8x8 variance wrapper.
 *
 * Runs vp8_get8x8var_sse2() once, stores the helper's SSE output in *sse,
 * and returns that value minus (Sum * Sum) >> 6.
 */
unsigned int vp8_variance8x8_wmt(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride,
                                 unsigned int *sse) {
    unsigned int sq_err;
    unsigned int sum_squared;
    int diff_sum;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err, &diff_sum);

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 6);
}
|
171 |
|
172 |
|
/* 16x16 variance wrapper.
 *
 * Runs vp8_get16x16var_sse2() once, stores the helper's SSE output in
 * *sse, and returns that value minus (Sum * Sum) >> 8.
 */
unsigned int vp8_variance16x16_wmt(const unsigned char *src_ptr,
                                   int source_stride,
                                   const unsigned char *ref_ptr,
                                   int recon_stride,
                                   unsigned int *sse) {
    unsigned int sq_err;
    unsigned int sum_squared;
    int diff_sum;

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &diff_sum);

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 8);
}
|
/* 16x16 wrapper that reports only the helper's SSE output.
 *
 * Runs vp8_get16x16var_sse2(), writes its SSE output to *sse and returns
 * the same value. The helper's Sum output is received but not used.
 */
unsigned int vp8_mse16x16_wmt(const unsigned char *src_ptr,
                              int source_stride,
                              const unsigned char *ref_ptr,
                              int recon_stride,
                              unsigned int *sse) {
    unsigned int sq_err;
    int unused_sum; /* the helper requires somewhere to store Sum */

    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                         &sq_err, &unused_sum);

    *sse = sq_err;
    return sq_err;
}
|
204 |
|
205 |
|
/* 16x8 variance wrapper built from two vp8_get8x8var_sse2() calls.
 *
 * The second call starts 8 bytes further along both rows. The two SSE
 * outputs are added and stored in *sse; the two Sum outputs are added, and
 * the return value is the combined SSE minus (Sum * Sum) >> 7.
 */
unsigned int vp8_variance16x8_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse) {
    unsigned int sq_err_a, sq_err_b;
    unsigned int sq_err_total;
    unsigned int sum_squared;
    int sum_a, sum_b;
    int sum_total;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err_a, &sum_a);
    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride,
                       &sq_err_b, &sum_b);

    sq_err_total = sq_err_a + sq_err_b;
    sum_total = sum_a + sum_b;

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)sum_total * sum_total;

    *sse = sq_err_total;
    return sq_err_total - (sum_squared >> 7);
}
|
226 |
|
/* 8x16 variance wrapper built from two vp8_get8x8var_sse2() calls.
 *
 * The second call starts 8 rows further down (8 * stride) in both buffers.
 * The two SSE outputs are added and stored in *sse; the two Sum outputs are
 * added, and the return value is the combined SSE minus (Sum * Sum) >> 7.
 */
unsigned int vp8_variance8x16_wmt(const unsigned char *src_ptr,
                                  int source_stride,
                                  const unsigned char *ref_ptr,
                                  int recon_stride,
                                  unsigned int *sse) {
    unsigned int sq_err_a, sq_err_b;
    unsigned int sq_err_total;
    unsigned int sum_squared;
    int sum_a, sum_b;
    int sum_total;

    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride,
                       &sq_err_a, &sum_a);
    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride,
                       ref_ptr + 8 * recon_stride, recon_stride,
                       &sq_err_b, &sum_b);

    sq_err_total = sq_err_a + sq_err_b;
    sum_total = sum_a + sum_b;

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)sum_total * sum_total;

    *sse = sq_err_total;
    return sq_err_total - (sum_squared >> 7);
}
|
247 |
|
248 unsigned int vp8_sub_pixel_variance4x4_wmt |
|
249 ( |
|
250 const unsigned char *src_ptr, |
|
251 int src_pixels_per_line, |
|
252 int xoffset, |
|
253 int yoffset, |
|
254 const unsigned char *dst_ptr, |
|
255 int dst_pixels_per_line, |
|
256 unsigned int *sse |
|
257 ) |
|
258 { |
|
259 int xsum; |
|
260 unsigned int xxsum; |
|
261 vp8_filter_block2d_bil4x4_var_mmx( |
|
262 src_ptr, src_pixels_per_line, |
|
263 dst_ptr, dst_pixels_per_line, |
|
264 vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset], |
|
265 &xsum, &xxsum |
|
266 ); |
|
267 *sse = xxsum; |
|
268 return (xxsum - (((unsigned int)xsum * xsum) >> 4)); |
|
269 } |
|
270 |
|
271 |
|
/* 8x8 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are routed to the dedicated
 * half_horiz / half_vert / half_horiz_vert 8-wide routines (offset 4 is
 * presumably the half-pixel position -- confirm). Every other pair goes
 * through vp8_filter_block2d_bil_var_sse2(). All paths use a height of 8.
 *
 * The selected routine's `sumsquared` output is stored in *sse; the return
 * value is that output minus (sum * sum) >> 6.
 */
unsigned int vp8_sub_pixel_variance8x8_wmt(const unsigned char *src_ptr,
                                           int src_pixels_per_line,
                                           int xoffset,
                                           int yoffset,
                                           const unsigned char *dst_ptr,
                                           int dst_pixels_per_line,
                                           unsigned int *sse) {
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    if (x_is_4 && yoffset == 0) {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 8,
                                         &diff_sum, &sq_err);
    } else if (y_is_4 && xoffset == 0) {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 8,
                                        &diff_sum, &sq_err);
    } else if (x_is_4 && y_is_4) {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line, 8,
                                              &diff_sum, &sq_err);
    } else {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 8,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_err);
    }

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 6);
}
|
319 |
|
/* 16x16 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are routed to the dedicated 16-wide
 * half_horiz / half_vert / half_horiz_vert routines (offset 4 is presumably
 * the half-pixel position -- confirm). Every other pair is handled by two
 * calls to vp8_filter_block2d_bil_var_sse2(), the second starting 8 bytes
 * further along each row, with the two results added together. All paths
 * use a height of 16.
 *
 * The accumulated `sumsquared` value is stored in *sse; the return value is
 * that value minus (sum * sum) >> 8.
 *
 * As the original author noted, this dispatch could be avoided if callers
 * invoked the appropriate routine directly.
 */
unsigned int vp8_sub_pixel_variance16x16_wmt(const unsigned char *src_ptr,
                                             int src_pixels_per_line,
                                             int xoffset,
                                             int yoffset,
                                             const unsigned char *dst_ptr,
                                             int dst_pixels_per_line,
                                             unsigned int *sse) {
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    if (x_is_4 && yoffset == 0) {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line, 16,
                                          &diff_sum, &sq_err);
    } else if (y_is_4 && xoffset == 0) {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 16,
                                         &diff_sum, &sq_err);
    } else if (x_is_4 && y_is_4) {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               16, &diff_sum, &sq_err);
    } else {
        int second_sum;
        unsigned int second_sq_err;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 16,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_err);

        /* Second pass, shifted 8 bytes to the right in both buffers. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line, 16,
                                        xoffset, yoffset,
                                        &second_sum, &second_sq_err);

        diff_sum += second_sum;
        sq_err += second_sq_err;
    }

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 8);
}
|
381 |
|
/* 16x16 sub-pixel wrapper that returns the value stored in *sse.
 *
 * Delegates to vp8_sub_pixel_variance16x16_wmt() purely for its write to
 * *sse; the variance it returns is discarded.
 */
unsigned int vp8_sub_pixel_mse16x16_wmt(const unsigned char *src_ptr,
                                        int src_pixels_per_line,
                                        int xoffset,
                                        int yoffset,
                                        const unsigned char *dst_ptr,
                                        int dst_pixels_per_line,
                                        unsigned int *sse) {
    (void)vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line,
                                          xoffset, yoffset,
                                          dst_ptr, dst_pixels_per_line,
                                          sse);
    return *sse;
}
|
395 |
|
/* 16x8 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are routed to the dedicated 16-wide
 * half_horiz / half_vert / half_horiz_vert routines (offset 4 is presumably
 * the half-pixel position -- confirm). Every other pair is handled by two
 * calls to vp8_filter_block2d_bil_var_sse2(), the second starting 8 bytes
 * further along each row, with the two results added together. All paths
 * use a height of 8.
 *
 * The accumulated `sumsquared` value is stored in *sse; the return value is
 * that value minus (sum * sum) >> 7.
 */
unsigned int vp8_sub_pixel_variance16x8_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    if (x_is_4 && yoffset == 0) {
        vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                          dst_ptr, dst_pixels_per_line, 8,
                                          &diff_sum, &sq_err);
    } else if (y_is_4 && xoffset == 0) {
        vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 8,
                                         &diff_sum, &sq_err);
    } else if (x_is_4 && y_is_4) {
        vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                               dst_ptr, dst_pixels_per_line,
                                               8, &diff_sum, &sq_err);
    } else {
        int second_sum;
        unsigned int second_sq_err;

        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 8,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_err);

        /* Second pass, shifted 8 bytes to the right in both buffers. */
        vp8_filter_block2d_bil_var_sse2(src_ptr + 8, src_pixels_per_line,
                                        dst_ptr + 8, dst_pixels_per_line, 8,
                                        xoffset, yoffset,
                                        &second_sum, &second_sq_err);

        diff_sum += second_sum;
        sq_err += second_sq_err;
    }

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 7);
}
|
452 |
|
/* 8x16 sub-pixel variance wrapper.
 *
 * Offset pairs (4,0), (0,4) and (4,4) are routed to the dedicated 8-wide
 * half_horiz / half_vert / half_horiz_vert routines (offset 4 is presumably
 * the half-pixel position -- confirm). Every other pair goes through a
 * single vp8_filter_block2d_bil_var_sse2() call. All paths use a height
 * of 16.
 *
 * The selected routine's `sumsquared` output is stored in *sse; the return
 * value is that output minus (sum * sum) >> 7.
 */
unsigned int vp8_sub_pixel_variance8x16_wmt(const unsigned char *src_ptr,
                                            int src_pixels_per_line,
                                            int xoffset,
                                            int yoffset,
                                            const unsigned char *dst_ptr,
                                            int dst_pixels_per_line,
                                            unsigned int *sse) {
    const int x_is_4 = (xoffset == 4);
    const int y_is_4 = (yoffset == 4);
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    if (x_is_4 && yoffset == 0) {
        vp8_half_horiz_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                         dst_ptr, dst_pixels_per_line, 16,
                                         &diff_sum, &sq_err);
    } else if (y_is_4 && xoffset == 0) {
        vp8_half_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 16,
                                        &diff_sum, &sq_err);
    } else if (x_is_4 && y_is_4) {
        vp8_half_horiz_vert_variance8x_h_sse2(src_ptr, src_pixels_per_line,
                                              dst_ptr, dst_pixels_per_line,
                                              16, &diff_sum, &sq_err);
    } else {
        vp8_filter_block2d_bil_var_sse2(src_ptr, src_pixels_per_line,
                                        dst_ptr, dst_pixels_per_line, 16,
                                        xoffset, yoffset,
                                        &diff_sum, &sq_err);
    }

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 7);
}
|
500 |
|
501 |
|
/* 16x16 wrapper around vp8_half_horiz_variance16x_h_sse2() (height 16).
 *
 * Stores the helper's `sumsquared` output in *sse and returns that value
 * minus (sum * sum) >> 8.
 */
unsigned int vp8_variance_halfpixvar16x16_h_wmt(const unsigned char *src_ptr,
                                                int src_pixels_per_line,
                                                const unsigned char *dst_ptr,
                                                int dst_pixels_per_line,
                                                unsigned int *sse) {
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    vp8_half_horiz_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                      dst_ptr, dst_pixels_per_line, 16,
                                      &diff_sum, &sq_err);

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 8);
}
|
520 |
|
521 |
|
/* 16x16 wrapper around vp8_half_vert_variance16x_h_sse2() (height 16).
 *
 * Stores the helper's `sumsquared` output in *sse and returns that value
 * minus (sum * sum) >> 8.
 */
unsigned int vp8_variance_halfpixvar16x16_v_wmt(const unsigned char *src_ptr,
                                                int src_pixels_per_line,
                                                const unsigned char *dst_ptr,
                                                int dst_pixels_per_line,
                                                unsigned int *sse) {
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    vp8_half_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                     dst_ptr, dst_pixels_per_line, 16,
                                     &diff_sum, &sq_err);

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 8);
}
|
539 |
|
540 |
|
/* 16x16 wrapper around vp8_half_horiz_vert_variance16x_h_sse2()
 * (height 16).
 *
 * Stores the helper's `sumsquared` output in *sse and returns that value
 * minus (sum * sum) >> 8.
 */
unsigned int vp8_variance_halfpixvar16x16_hv_wmt(const unsigned char *src_ptr,
                                                 int src_pixels_per_line,
                                                 const unsigned char *dst_ptr,
                                                 int dst_pixels_per_line,
                                                 unsigned int *sse) {
    int diff_sum;
    unsigned int sq_err;
    unsigned int sum_squared;

    vp8_half_horiz_vert_variance16x_h_sse2(src_ptr, src_pixels_per_line,
                                           dst_ptr, dst_pixels_per_line, 16,
                                           &diff_sum, &sq_err);

    /* Unsigned multiply: the square is taken modulo 2^32. */
    sum_squared = (unsigned int)diff_sum * diff_sum;

    *sse = sq_err;
    return sq_err - (sum_squared >> 8);
}