media/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm

changeset 6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/encoder/x86/vp9_sad_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

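; SAD_FN emits the shared cglobal prologue for the kernels below:
; %1 = block width, %2 = block height, %3 = number of general-purpose
; registers to reserve (5, or 7 when the kernel also wants precomputed
; stride*3 offsets), %4 = 0 for plain SAD or 1 for the _avg variant,
; which takes an extra second_pred pointer and averages it into the
; reference (pavgb) before the SAD is taken.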
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
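  ; movsxdifnidn sign-extends the 32-bit int stride arguments to pointer
  ; width on x86-64; on x86-32 the d and q names map to the same register,
  ; so no instruction is emitted.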
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea         src_stride3q, [src_strideq*3]
  lea         ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int vp9_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
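; Each kernel below returns the sum of absolute differences between a
; WxH source block and reference block, i.e. roughly (a scalar sketch,
; not the project's actual C reference implementation):
;
;   sad = 0;
;   for (row = 0; row < H; row++)
;     for (col = 0; col < W; col++)
;       sad += abs(src[row * src_stride + col] - ref[row * ref_stride + col]);
;   return sad;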
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov              n_rowsd, %1
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
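  ; psadbw sums |ref - src| over each group of 8 unsigned bytes, leaving
  ; a 16-bit partial total in each 64-bit lane; the lanes are accumulated
  ; across rows with paddd into m0.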
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

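  ; Fold the upper 64-bit lane into the lower one and return the final
  ; 32-bit sum in eax.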
  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
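; Two 32-byte rows are handled per loop iteration, so the row counter is
; initialized to %1/2.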
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov              n_rowsd, %1/2
  pxor                  m0, m0
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
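; Four 16-byte rows are handled per iteration; the 7-register SAD_FN form
; supplies src_stride3q/ref_stride3q for the row*3 addressing.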
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN  8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN  8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
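; Rows are only 8 bytes wide here, so two rows are packed into a single
; xmm register (movh/movhps) before each psadbw; four rows per iteration.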
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  movhlps               m1, m0
  paddd                 m0, m1
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN  8 ; sad8x8_sse2
SAD8XN  4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN  8, 1 ; sad8x8_avg_sse2
SAD8XN  4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
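; The 4-wide kernels use 64-bit MMX registers (INIT_MMX below); pavgb and
; psadbw on MMX registers were introduced with SSE, hence the _sse rather
; than _sse2 suffix. Four 4-byte rows are paired with punpckldq each
; iteration, and no final movhlps fold is needed because the whole sum
; fits in one 64-bit lane.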
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov              n_rowsd, %1/4
  pxor                  m0, m0

.loop:
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m3, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2]
%endif
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m6, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m6
  psadbw                m1, m2
  psadbw                m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  movd                 eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN  8 ; sad4x8_sse
SAD4XN  4 ; sad4x4_sse
SAD4XN  8, 1 ; sad4x8_avg_sse
SAD4XN  4, 1 ; sad4x4_avg_sse
