media/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/encoder/x86/vp9_subtract_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,127 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vp9_subtract_block(int rows, int cols,
+;                         int16_t *diff, ptrdiff_t diff_stride,
+;                         const uint8_t *src, ptrdiff_t src_stride,
+;                         const uint8_t *pred, ptrdiff_t pred_stride)
+
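+; Reference semantics (a C sketch for orientation only, not part of the
+; imported file): every element of the int16_t residual block is the widened
+; difference of the corresponding source and prediction bytes:
+;
+;   for (r = 0; r < rows; r++)
+;     for (c = 0; c < cols; c++)
+;       diff[r * diff_stride + c] = (int16_t)src[r * src_stride + c] -
+;                                   (int16_t)pred[r * pred_stride + c];
+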
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+                        rows, cols, diff, diff_stride, src, src_stride, \
+                        pred, pred_stride
+%define pred_str colsq
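+; Only seven arguments are loaded into registers; the eighth, pred_stride,
+; stays in its stack slot and is read via pred_stridemp.  Once the width
+; dispatch below is done, cols is no longer needed, so its register (colsq)
+; is reused to hold pred_stride under the alias pred_str.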
+  pxor                  m7, m7         ; dedicated zero register
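+; Dispatch on the block width in cols: widths 4, 8, 16 and 32 branch to
+; dedicated loops; anything wider (64) falls through to .loop_64 below.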
+  cmp                colsd, 4
+  je .case_4
+  cmp                colsd, 8
+  je .case_8
+  cmp                colsd, 16
+  je .case_16
+  cmp                colsd, 32
+  je .case_32
+
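+; loop16 subtracts 32 pixels: it loads 16 source bytes from srcq+%1 and
+; srcq+%2, the matching prediction bytes from predq+%3 and predq+%4, widens
+; bytes to words against the zero register m7, subtracts, and stores the four
+; resulting word vectors at diffq+%5 and diffq+%6.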
+%macro loop16 6
+  mova                  m0, [srcq+%1]
+  mova                  m4, [srcq+%2]
+  mova                  m1, [predq+%3]
+  mova                  m5, [predq+%4]
+  punpckhbw             m2, m0, m7
+  punpckhbw             m3, m1, m7
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  psubw                 m2, m3
+  psubw                 m0, m1
+  punpckhbw             m1, m4, m7
+  punpckhbw             m3, m5, m7
+  punpcklbw             m4, m7
+  punpcklbw             m5, m7
+  psubw                 m1, m3
+  psubw                 m4, m5
+  mova [diffq+mmsize*0+%5], m0
+  mova [diffq+mmsize*1+%5], m2
+  mova [diffq+mmsize*0+%6], m4
+  mova [diffq+mmsize*1+%6], m1
+%endmacro
+
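+; Default path, 64-wide blocks: two loop16 expansions cover one full row of
+; 64 pixels, then src/pred advance by one row and diff by diff_stride
+; int16_t elements (diff_strideq*2 bytes).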
+  mov             pred_str, pred_stridemp
+.loop_64:
+  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_64
+  RET
+
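+; 32-wide blocks: one loop16 expansion per iteration handles one row.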
+.case_32:
+  mov             pred_str, pred_stridemp
+.loop_32:
+  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  dec                rowsd
+  jg .loop_32
+  RET
+
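+; 16-wide blocks: one loop16 expansion handles two rows (the second row is
+; addressed through the stride operands), so the row counter drops by 2.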
+.case_16:
+  mov             pred_str, pred_stridemp
+.loop_16:
+  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                predq, [predq+pred_str*2]
+  lea                 srcq, [srcq+src_strideq*2]
+  sub                rowsd, 2
+  jg .loop_16
+  RET
+
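+; loop_h handles two rows of a narrow block: movh loads 8 bytes per row under
+; INIT_XMM (movq) and 4 bytes under INIT_MMX (movd); the bytes are widened
+; against m7 and subtracted, and one row of diffs is stored at diffq and one
+; at diffq+diff_strideq*2.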
+%macro loop_h 0
+  movh                  m0, [srcq]
+  movh                  m2, [srcq+src_strideq]
+  movh                  m1, [predq]
+  movh                  m3, [predq+pred_str]
+  punpcklbw             m0, m7
+  punpcklbw             m1, m7
+  punpcklbw             m2, m7
+  punpcklbw             m3, m7
+  psubw                 m0, m1
+  psubw                 m2, m3
+  mova             [diffq], m0
+  mova [diffq+diff_strideq*2], m2
+%endmacro
+
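+; 8-wide blocks: loop_h with XMM registers (8-byte loads), two rows per
+; iteration.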
+.case_8:
+  mov             pred_str, pred_stridemp
+.loop_8:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_8
+  RET
+
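+; 4-wide blocks: switch to MMX so loop_h re-expands with 4-byte loads
+; (movh becomes movd) and 8-byte mova stores, i.e. four int16_t diffs per
+; row; still two rows per iteration.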
+INIT_MMX
+.case_4:
+  mov             pred_str, pred_stridemp
+.loop_4:
+  loop_h
+  lea                diffq, [diffq+diff_strideq*4]
+  lea                 srcq, [srcq+src_strideq*2]
+  lea                predq, [predq+pred_str*2]
+  sub                rowsd, 2
+  jg .loop_4
+  RET
