;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff[r][c] = (int16_t)src[r][c] - (int16_t)pred[r][c] for a
; rows x cols block of 8-bit pixels, widening to 16 bits.  The code
; dispatches on cols (4/8/16/32); any other width falls through to the
; 64-wide loop below.  Strides are in elements of the respective arrays,
; so byte offsets into diff are scaled by 2 (int16_t).

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; colsq is dead once the width dispatch below is done, so its register is
; reused to hold pred_stride (loaded from its stack slot, pred_stridemp).
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; loop16 src_off1, src_off2, pred_off1, pred_off2, diff_off1, diff_off2
;
; Subtracts two 16-pixel groups: src[off1]-pred[off1] -> diff[off1] and
; src[off2]-pred[off2] -> diff[off2].  Each 16-byte load is widened to
; 16-bit lanes by interleaving with the zero register m7 (punpckl/hbw),
; then psubw produces the signed differences.  Each 16-pixel group
; yields two 16-byte stores (low half then high half) at the diff
; offsets.  Offsets are byte offsets; callers pass diff offsets already
; scaled for int16_t.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7
  punpckhbw             m3, m1, m7
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

; cols == 64 (fallthrough default): two loop16 invocations cover one
; 64-pixel row per iteration; rows is decremented once per row.
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2]  ; *2: diff is int16_t
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

; cols == 32: one loop16 covers the whole row; one row per iteration.
.case_32:
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

; cols == 16: loop16's two 16-pixel groups are row r and row r+1 (the
; second offsets are the row strides), so each iteration handles TWO
; rows — hence the *2/*4 pointer advances and "sub rowsd, 2".
.case_16:
  mov             pred_str, pred_stridemp
.loop_16:
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtracts two rows of a narrow block per invocation.  movh
; loads a half register (per x86inc's movh mapping: 8 bytes for XMM,
; presumably 4 for MMX — so 8 pixels/row in .case_8 and 4 in .case_4;
; confirm against x86inc.asm).  Widening/subtract is as in loop16; each
; row produces one full-register store of 16-bit differences.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

; cols == 8: XMM registers, two rows per iteration.
.case_8:
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

; cols == 4: switch to MMX so loop_h re-expands with 8-byte registers
; (mmsize == 8), halving every load/store width; same two-rows-per-
; iteration structure as .case_8.  Note m7 was zeroed as xmm7 above;
; mm7 is the low half of no architectural state shared with xmm7 —
; x86inc re-maps m7 to mm7 here, and pxor'ing mm7 is not re-done, so
; this relies on loop_h being expanded after INIT_MMX with mm7 zero at
; entry — TODO(review): confirm against upstream libvpx, where this
; layout is identical.
INIT_MMX
.case_4:
  mov             pred_str, pred_stridemp
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  RET