;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vp9_subtract_block(int rows, int cols,
;                         int16_t *diff, ptrdiff_t diff_stride,
;                         const uint8_t *src, ptrdiff_t src_stride,
;                         const uint8_t *pred, ptrdiff_t pred_stride)
;
; Computes diff[r][c] = src[r][c] - pred[r][c] for a rows x cols block of
; 8-bit pixels, widening to int16.  cols is one of {4, 8, 16, 32, 64};
; anything that is not 4/8/16/32 falls through to the 64-wide loop.
;
; Register roles (after cglobal argument loading):
;   rowsd   - remaining row count (loop counter)
;   colsq   - reused as pred_stride once dispatch is done (see pred_str)
;   diffq   - output pointer (int16), diff_strideq is in elements
;   srcq    - source pixels, src_strideq in bytes
;   predq   - prediction pixels, pred stride reloaded from stack (pred_stridemp)
;   m7      - dedicated zero register for byte->word unpacking
; Clobbers: xmm0-xmm7 (and mm0-mm3/mm7 on the 4-wide path), flags.
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal subtract_block, 7, 7, 8, \
                        rows, cols, diff, diff_stride, src, src_stride, \
                        pred, pred_stride
; cols is fully decoded by the dispatch below, so its register can be
; recycled to hold pred_stride (which did not get a register of its own).
%define pred_str colsq
  pxor                  m7, m7         ; dedicated zero register
  cmp                colsd, 4
  je .case_4
  cmp                colsd, 8
  je .case_8
  cmp                colsd, 16
  je .case_16
  cmp                colsd, 32
  je .case_32

; loop16 %1-%2: byte offsets of the two 16-pixel src loads
;        %3-%4: byte offsets of the two 16-pixel pred loads
;        %5-%6: byte offsets of the two 32-byte diff stores
; Subtracts 2x16 pixels, widening u8 - u8 -> i16.  Clobbers m0-m5.
; Relies on m7 == 0 for the unpacks.
%macro loop16 6
  mova                  m0, [srcq+%1]
  mova                  m4, [srcq+%2]
  mova                  m1, [predq+%3]
  mova                  m5, [predq+%4]
  punpckhbw             m2, m0, m7     ; high 8 src pixels -> words
  punpckhbw             m3, m1, m7     ; high 8 pred pixels -> words
  punpcklbw             m0, m7         ; low 8 src pixels -> words
  punpcklbw             m1, m7         ; low 8 pred pixels -> words
  psubw                 m2, m3
  psubw                 m0, m1
  punpckhbw             m1, m4, m7
  punpckhbw             m3, m5, m7
  punpcklbw             m4, m7
  punpcklbw             m5, m7
  psubw                 m1, m3
  psubw                 m4, m5
  mova [diffq+mmsize*0+%5], m0
  mova [diffq+mmsize*1+%5], m2
  mova [diffq+mmsize*0+%6], m4
  mova [diffq+mmsize*1+%6], m1
%endmacro

  ; fall-through: cols == 64, one row per iteration
  mov             pred_str, pred_stridemp
.loop_64:
  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
  lea                diffq, [diffq+diff_strideq*2] ; *2: stride is in int16 units
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_64
  RET

.case_32:                              ; one 32-pixel row per iteration
  mov             pred_str, pred_stridemp
.loop_32:
  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
  lea                diffq, [diffq+diff_strideq*2]
  add                predq, pred_str
  add                 srcq, src_strideq
  dec                rowsd
  jg .loop_32
  RET

.case_16:                              ; two 16-pixel rows per iteration
  mov             pred_str, pred_stridemp
.loop_16:
  ; second "offset" of each pair is one full stride, i.e. the next row
  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
  lea                diffq, [diffq+diff_strideq*4]
  lea                predq, [predq+pred_str*2]
  lea                 srcq, [srcq+src_strideq*2]
  sub                rowsd, 2
  jg .loop_16
  RET

; loop_h: subtracts two rows of mmsize/2 pixels (8 for XMM, 4 for MMX).
; Clobbers m0-m3.  Relies on m7 == 0 for the current register size.
%macro loop_h 0
  movh                  m0, [srcq]
  movh                  m2, [srcq+src_strideq]
  movh                  m1, [predq]
  movh                  m3, [predq+pred_str]
  punpcklbw             m0, m7
  punpcklbw             m1, m7
  punpcklbw             m2, m7
  punpcklbw             m3, m7
  psubw                 m0, m1
  psubw                 m2, m3
  mova             [diffq], m0
  mova [diffq+diff_strideq*2], m2
%endmacro

.case_8:                               ; two 8-pixel rows per iteration
  mov             pred_str, pred_stridemp
.loop_8:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_8
  RET

INIT_MMX
.case_4:                               ; two 4-pixel rows per iteration (MMX)
  mov             pred_str, pred_stridemp
  ; After INIT_MMX, m7 is mm7, which the entry pxor (xmm7) did NOT zero.
  ; The old code happened to work only because the same garbage high bytes
  ; were packed into both psubw operands and cancelled; zero it explicitly.
  pxor                  m7, m7
.loop_4:
  loop_h
  lea                diffq, [diffq+diff_strideq*4]
  lea                 srcq, [srcq+src_strideq*2]
  lea                predq, [predq+pred_str*2]
  sub                rowsd, 2
  jg .loop_4
  ; MMX use leaves the x87 tag word dirty; the ABI requires clean x87 state
  ; at function exit, and the encoder's FP code would misbehave without this.
  emms
  RET