media/libvpx/vp9/common/x86/vp9_copy_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/common/x86/vp9_copy_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,193 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +%include "third_party/x86inc/x86inc.asm"
    1.15 +
    1.16 +SECTION .text
    1.17 +
    1.18 +;-----------------------------------------------------------------------
    1.19 +; convolve_fn <copy|avg>
    1.20 +;
    1.21 +; Expands to one function, declared with cglobal as convolve_copy or
    1.22 +; convolve_avg, taking the arguments
    1.23 +;   src, src_stride, dst, dst_stride, fx, fxs, fy, fys, w, h
    1.24 +; Only the first four are loaded into registers; w and h are read from
    1.25 +; their argument slots (wm, hm) and fx, fxs, fy, fys are never read.
    1.26 +;
    1.27 +;   copy: each dst row receives the matching src row unchanged.
    1.28 +;   avg : each dst byte becomes pavgb(src, dst), the rounded-up average.
    1.29 +;
    1.30 +; Width dispatch: w == 4, 8, 16 or 32 selects a dedicated loop; any other
    1.31 +; value falls through to the 64-byte-wide loop.
    1.32 +;
    1.33 +; Requirements visible in the code:
    1.34 +;   * h must be a non-zero multiple of the rows handled per iteration
    1.35 +;     (1 for w64, 2 for w32, 4 for w16, w8 and w4): every loop exits only
    1.36 +;     when its counter reaches exactly zero.
    1.37 +;   * The XMM paths (w16, w32, w64) store with mova and use dst as a
    1.38 +;     pavgb memory operand, so dst rows must be 16-byte aligned there.
    1.39 +;     src is always read with unaligned loads.
    1.40 +;
    1.41 +; Scratch registers: r4d = width, then rows remaining; r5 = 3*src_stride
    1.42 +; and r6 = 3*dst_stride in the loops that handle four rows at a time.
    1.43 +;
    1.44 +; NOTE(review): the w8 and w4 paths use MMX registers and no emms appears
    1.45 +; in this macro - presumably callers reset the MMX state; confirm.
    1.46 +;-----------------------------------------------------------------------
    1.47 +%macro convolve_fn 1
    1.48 +INIT_XMM sse2
    1.49 +cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
    1.50 +                              fx, fxs, fy, fys, w, h
    1.51 +  mov                    r4d, dword wm       ; r4d = block width
    1.52 +  cmp                    r4d, 4
    1.53 +  je .w4
    1.54 +  cmp                    r4d, 8
    1.55 +  je .w8
    1.56 +  cmp                    r4d, 16
    1.57 +  je .w16
    1.58 +  cmp                    r4d, 32
    1.59 +  je .w32
    1.60 +
    1.61 +  ; Any other width: 64 bytes per row, one row per pass.
    1.62 +  mov                    r4d, dword hm       ; r4d = rows remaining
    1.63 +.loop64:
    1.64 +  movu                    m0, [srcq]
    1.65 +  movu                    m1, [srcq+16]
    1.66 +  movu                    m2, [srcq+32]
    1.67 +  movu                    m3, [srcq+48]
    1.68 +  add                   srcq, src_strideq
    1.69 +%ifidn %1, avg
    1.70 +  pavgb                   m0, [dstq]
    1.71 +  pavgb                   m1, [dstq+16]
    1.72 +  pavgb                   m2, [dstq+32]
    1.73 +  pavgb                   m3, [dstq+48]
    1.74 +%endif
    1.75 +  mova             [dstq   ], m0
    1.76 +  mova             [dstq+16], m1
    1.77 +  mova             [dstq+32], m2
    1.78 +  mova             [dstq+48], m3
    1.79 +  add                   dstq, dst_strideq
    1.80 +  dec                    r4d
    1.81 +  jnz .loop64
    1.82 +  RET
    1.83 +
    1.84 +  ; 32 bytes per row, two rows per pass.
    1.85 +.w32:
    1.86 +  mov                    r4d, dword hm       ; r4d = rows remaining
    1.87 +.loop32:
    1.88 +  movu                    m0, [srcq]
    1.89 +  movu                    m1, [srcq+16]
    1.90 +  movu                    m2, [srcq+src_strideq]
    1.91 +  movu                    m3, [srcq+src_strideq+16]
    1.92 +  lea                   srcq, [srcq+src_strideq*2]
    1.93 +%ifidn %1, avg
    1.94 +  pavgb                   m0, [dstq]
    1.95 +  pavgb                   m1, [dstq            +16]
    1.96 +  pavgb                   m2, [dstq+dst_strideq]
    1.97 +  pavgb                   m3, [dstq+dst_strideq+16]
    1.98 +%endif
    1.99 +  mova [dstq               ], m0
   1.100 +  mova [dstq            +16], m1
   1.101 +  mova [dstq+dst_strideq   ], m2
   1.102 +  mova [dstq+dst_strideq+16], m3
   1.103 +  lea                   dstq, [dstq+dst_strideq*2]
   1.104 +  sub                    r4d, 2
   1.105 +  jnz .loop32
   1.106 +  RET
   1.107 +
   1.108 +  ; 16 bytes per row, four rows per pass.
   1.109 +.w16:
   1.110 +  mov                    r4d, dword hm       ; r4d = rows remaining
   1.111 +  lea                    r5q, [src_strideq*3] ; offset of the 4th src row
   1.112 +  lea                    r6q, [dst_strideq*3] ; offset of the 4th dst row
   1.113 +.loop16:
   1.114 +  movu                    m0, [srcq]
   1.115 +  movu                    m1, [srcq+src_strideq]
   1.116 +  movu                    m2, [srcq+src_strideq*2]
   1.117 +  movu                    m3, [srcq+r5q]
   1.118 +  lea                   srcq, [srcq+src_strideq*4]
   1.119 +%ifidn %1, avg
   1.120 +  pavgb                   m0, [dstq]
   1.121 +  pavgb                   m1, [dstq+dst_strideq]
   1.122 +  pavgb                   m2, [dstq+dst_strideq*2]
   1.123 +  pavgb                   m3, [dstq+r6q]
   1.124 +%endif
   1.125 +  mova  [dstq              ], m0
   1.126 +  mova  [dstq+dst_strideq  ], m1
   1.127 +  mova  [dstq+dst_strideq*2], m2
   1.128 +  mova  [dstq+r6q          ], m3
   1.129 +  lea                   dstq, [dstq+dst_strideq*4]
   1.130 +  sub                    r4d, 4
   1.131 +  jnz .loop16
   1.132 +  RET
   1.133 +
   1.134 +; From here on m0-m7 are 64-bit MMX registers: mova/movu move 8 bytes, movh 4.
   1.135 +INIT_MMX sse
   1.136 +  ; 8 bytes per row, four rows per pass.
   1.137 +.w8:
   1.138 +  mov                    r4d, dword hm       ; r4d = rows remaining
   1.139 +  lea                    r5q, [src_strideq*3] ; offset of the 4th src row
   1.140 +  lea                    r6q, [dst_strideq*3] ; offset of the 4th dst row
   1.141 +.loop8:
   1.142 +  movu                    m0, [srcq]
   1.143 +  movu                    m1, [srcq+src_strideq]
   1.144 +  movu                    m2, [srcq+src_strideq*2]
   1.145 +  movu                    m3, [srcq+r5q]
   1.146 +  lea                   srcq, [srcq+src_strideq*4]
   1.147 +%ifidn %1, avg
   1.148 +  pavgb                   m0, [dstq]
   1.149 +  pavgb                   m1, [dstq+dst_strideq]
   1.150 +  pavgb                   m2, [dstq+dst_strideq*2]
   1.151 +  pavgb                   m3, [dstq+r6q]
   1.152 +%endif
   1.153 +  mova  [dstq              ], m0
   1.154 +  mova  [dstq+dst_strideq  ], m1
   1.155 +  mova  [dstq+dst_strideq*2], m2
   1.156 +  mova  [dstq+r6q          ], m3
   1.157 +  lea                   dstq, [dstq+dst_strideq*4]
   1.158 +  sub                    r4d, 4
   1.159 +  jnz .loop8
   1.160 +  RET
   1.161 +
   1.162 +  ; 4 bytes per row, four rows per pass.
   1.163 +.w4:
   1.164 +  mov                    r4d, dword hm       ; r4d = rows remaining
   1.165 +  lea                    r5q, [src_strideq*3] ; offset of the 4th src row
   1.166 +  lea                    r6q, [dst_strideq*3] ; offset of the 4th dst row
   1.167 +.loop4:
   1.168 +  movh                    m0, [srcq]
   1.169 +  movh                    m1, [srcq+src_strideq]
   1.170 +  movh                    m2, [srcq+src_strideq*2]
   1.171 +  movh                    m3, [srcq+r5q]
   1.172 +  lea                   srcq, [srcq+src_strideq*4]
   1.173 +%ifidn %1, avg
   1.174 +  ; Load just the 4 dst bytes of each row: a pavgb memory operand would
   1.175 +  ; read a full 8-byte MMX word and run 4 bytes past the end of the row.
   1.176 +  movh                    m4, [dstq]
   1.177 +  movh                    m5, [dstq+dst_strideq]
   1.178 +  movh                    m6, [dstq+dst_strideq*2]
   1.179 +  movh                    m7, [dstq+r6q]
   1.180 +  pavgb                   m0, m4
   1.181 +  pavgb                   m1, m5
   1.182 +  pavgb                   m2, m6
   1.183 +  pavgb                   m3, m7
   1.184 +%endif
   1.185 +  movh  [dstq              ], m0
   1.186 +  movh  [dstq+dst_strideq  ], m1
   1.187 +  movh  [dstq+dst_strideq*2], m2
   1.188 +  movh  [dstq+r6q          ], m3
   1.189 +  lea                   dstq, [dstq+dst_strideq*4]
   1.190 +  sub                    r4d, 4
   1.191 +  jnz .loop4
   1.192 +  RET
   1.193 +%endmacro
   1.194 +
   1.195 +convolve_fn copy
   1.196 +convolve_fn avg

mercurial