media/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,378 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +%include "vpx_ports/x86_abi_support.asm"
    1.15 +; STACK_FRAME_CREATE_X3: per-ABI prologue. Binds src_ptr/src_stride/ref_ptr/ref_stride (arguments 0-3), the extra names end_ptr/ret_var, and result_ptr/max_err/height (three aliases for the fifth argument).
    1.16 +%macro STACK_FRAME_CREATE_X3 0
    1.17 +%if ABI_IS_32BIT                                ; 32-bit build: arguments are fetched through arg(n)
    1.18 +  %define     src_ptr       rsi
    1.19 +  %define     src_stride    rax
    1.20 +  %define     ref_ptr       rdi
    1.21 +  %define     ref_stride    rdx
    1.22 +  %define     end_ptr       rcx
    1.23 +  %define     ret_var       rbx
    1.24 +  %define     result_ptr    arg(4)
    1.25 +  %define     max_err       arg(4)
    1.26 +  %define     height        dword ptr arg(4)
    1.27 +    push        rbp
    1.28 +    mov         rbp,        rsp                 ; frame pointer; arg(n) comes from x86_abi_support.asm and presumably addresses relative to rbp - confirm
    1.29 +    push        rsi
    1.30 +    push        rdi
    1.31 +    push        rbx
    1.32 +; rsi, rdi and rbx are saved above; STACK_FRAME_DESTROY_X3 pops them in the reverse order
    1.33 +    mov         rsi,        arg(0)              ; src_ptr
    1.34 +    mov         rdi,        arg(2)              ; ref_ptr
    1.35 +; the two int strides are loaded from their dword argument slots with movsxd
    1.36 +    movsxd      rax,        dword ptr arg(1)    ; src_stride
    1.37 +    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
    1.38 +%else                                            ; 64-bit builds: the names map directly onto registers, no loads are emitted
    1.39 +  %if LIBVPX_YASM_WIN64                          ; Win64: SAVE_XMM (x86_abi_support.asm) presumably spills xmm6-xmm7 and defines xmm_stack_space; fifth argument is read from the stack - confirm offsets against SAVE_XMM
    1.40 +    SAVE_XMM 7, u
    1.41 +    %define     src_ptr     rcx
    1.42 +    %define     src_stride  rdx
    1.43 +    %define     ref_ptr     r8
    1.44 +    %define     ref_stride  r9
    1.45 +    %define     end_ptr     r10
    1.46 +    %define     ret_var     r11
    1.47 +    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    1.48 +    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    1.49 +    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
    1.50 +  %else                                          ; other 64-bit targets: arguments 0-4 are used where they arrive (rdi, rsi, rdx, rcx, r8)
    1.51 +    %define     src_ptr     rdi
    1.52 +    %define     src_stride  rsi
    1.53 +    %define     ref_ptr     rdx
    1.54 +    %define     ref_stride  rcx
    1.55 +    %define     end_ptr     r9
    1.56 +    %define     ret_var     r10
    1.57 +    %define     result_ptr  r8
    1.58 +    %define     max_err     r8
    1.59 +    %define     height      r8
    1.60 +  %endif                                         ; LIBVPX_YASM_WIN64
    1.61 +%endif                                           ; ABI_IS_32BIT
    1.62 +; NOTE(review): unlike the 32-bit path, the 64-bit paths never movsxd the int strides; the full 64-bit registers are used for addressing, so this relies on callers passing them already sign-extended - confirm
    1.63 +%endmacro
    1.64 +; STACK_FRAME_DESTROY_X3: epilogue paired with STACK_FRAME_CREATE_X3 - blanks the symbolic names, undoes the per-ABI prologue work and returns to the caller.
    1.65 +%macro STACK_FRAME_DESTROY_X3 0
    1.66 +  %define     src_ptr
    1.67 +  %define     src_stride
    1.68 +  %define     ref_ptr
    1.69 +  %define     ref_stride
    1.70 +  %define     end_ptr
    1.71 +  %define     ret_var
    1.72 +  %define     result_ptr
    1.73 +  %define     max_err
    1.74 +  %define     height
    1.75 +; every name bound by STACK_FRAME_CREATE_X3 is now redefined as empty text until the next prologue binds it again
    1.76 +%if ABI_IS_32BIT                                 ; 32-bit build: pop in the reverse of the prologue's push order
    1.77 +    pop         rbx
    1.78 +    pop         rdi
    1.79 +    pop         rsi
    1.80 +    pop         rbp
    1.81 +%else
    1.82 +  %if LIBVPX_YASM_WIN64                          ; Win64: RESTORE_XMM is the counterpart of the prologue's SAVE_XMM (defined in x86_abi_support.asm)
    1.83 +    RESTORE_XMM
    1.84 +  %endif
    1.85 +%endif
    1.86 +    ret                                          ; the routines in this file leave no return value; results go through result_ptr
    1.87 +%endmacro
    1.88 +; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride: add the SADs of two 16-byte source rows against ref, ref+1 and ref+2 into xmm5, xmm6 and xmm7. Clobbers xmm0-xmm3.
    1.89 +%macro PROCESS_16X2X3 5
    1.90 +%if %1==0                                                       ; stage 0: first row pair, results initialise the accumulators
    1.91 +        movdqa          xmm0,       XMMWORD PTR [%2]            ; source row (movdqa: the source address must be 16-byte aligned)
    1.92 +        lddqu           xmm5,       XMMWORD PTR [%3]            ; reference row at byte offset 0 (unaligned load)
    1.93 +        lddqu           xmm6,       XMMWORD PTR [%3+1]          ; reference row at byte offset 1
    1.94 +        lddqu           xmm7,       XMMWORD PTR [%3+2]          ; reference row at byte offset 2
    1.95 +
    1.96 +        psadbw          xmm5,       xmm0                        ; psadbw leaves one partial sum in each 64-bit half
    1.97 +        psadbw          xmm6,       xmm0
    1.98 +        psadbw          xmm7,       xmm0
    1.99 +%else                                                           ; any other stage: same work through temporaries, then added to the running totals
   1.100 +        movdqa          xmm0,       XMMWORD PTR [%2]
   1.101 +        lddqu           xmm1,       XMMWORD PTR [%3]
   1.102 +        lddqu           xmm2,       XMMWORD PTR [%3+1]
   1.103 +        lddqu           xmm3,       XMMWORD PTR [%3+2]
   1.104 +
   1.105 +        psadbw          xmm1,       xmm0
   1.106 +        psadbw          xmm2,       xmm0
   1.107 +        psadbw          xmm3,       xmm0
   1.108 +
   1.109 +        paddw           xmm5,       xmm1                        ; running total for offset 0
   1.110 +        paddw           xmm6,       xmm2                        ; running total for offset 1
   1.111 +        paddw           xmm7,       xmm3                        ; running total for offset 2
   1.112 +%endif
   1.113 +        movdqa          xmm0,       XMMWORD PTR [%2+%4]         ; second row of the pair: one stride further down in source and reference
   1.114 +        lddqu           xmm1,       XMMWORD PTR [%3+%5]
   1.115 +        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
   1.116 +        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
   1.117 +
   1.118 +%if %1==0 || %1==1                                              ; stages 0 and 1 advance both pointers by two rows; other stages (2 is used for the final pair) leave them alone
   1.119 +        lea             %2,         [%2+%4*2]
   1.120 +        lea             %3,         [%3+%5*2]
   1.121 +%endif
   1.122 +; the loads above were issued before the pointer update, so the sums below still belong to the second row of this pair
   1.123 +        psadbw          xmm1,       xmm0
   1.124 +        psadbw          xmm2,       xmm0
   1.125 +        psadbw          xmm3,       xmm0
   1.126 +
   1.127 +        paddw           xmm5,       xmm1
   1.128 +        paddw           xmm6,       xmm2
   1.129 +        paddw           xmm7,       xmm3
   1.130 +%endmacro
   1.131 +; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride: MMX variant of PROCESS_16X2X3 for 8-byte rows; adds the SADs against ref, ref+1 and ref+2 into mm5, mm6 and mm7. Clobbers mm0-mm3.
   1.132 +%macro PROCESS_8X2X3 5
   1.133 +%if %1==0                                                      ; stage 0: first row pair, results initialise the accumulators
   1.134 +        movq            mm0,       QWORD PTR [%2]              ; source row (8 bytes)
   1.135 +        movq            mm5,       QWORD PTR [%3]              ; reference row at byte offset 0
   1.136 +        movq            mm6,       QWORD PTR [%3+1]            ; reference row at byte offset 1
   1.137 +        movq            mm7,       QWORD PTR [%3+2]            ; reference row at byte offset 2
   1.138 +
   1.139 +        psadbw          mm5,       mm0
   1.140 +        psadbw          mm6,       mm0
   1.141 +        psadbw          mm7,       mm0
   1.142 +%else                                                          ; any other stage: same work through temporaries, then added to the running totals
   1.143 +        movq            mm0,       QWORD PTR [%2]
   1.144 +        movq            mm1,       QWORD PTR [%3]
   1.145 +        movq            mm2,       QWORD PTR [%3+1]
   1.146 +        movq            mm3,       QWORD PTR [%3+2]
   1.147 +
   1.148 +        psadbw          mm1,       mm0
   1.149 +        psadbw          mm2,       mm0
   1.150 +        psadbw          mm3,       mm0
   1.151 +
   1.152 +        paddw           mm5,       mm1                         ; running total for offset 0
   1.153 +        paddw           mm6,       mm2                         ; running total for offset 1
   1.154 +        paddw           mm7,       mm3                         ; running total for offset 2
   1.155 +%endif
   1.156 +        movq            mm0,       QWORD PTR [%2+%4]           ; second row of the pair: one stride further down in source and reference
   1.157 +        movq            mm1,       QWORD PTR [%3+%5]
   1.158 +        movq            mm2,       QWORD PTR [%3+%5+1]
   1.159 +        movq            mm3,       QWORD PTR [%3+%5+2]
   1.160 +
   1.161 +%if %1==0 || %1==1                                             ; stages 0 and 1 advance both pointers by two rows; other stages (2 is used for the final pair) leave them alone
   1.162 +        lea             %2,        [%2+%4*2]
   1.163 +        lea             %3,        [%3+%5*2]
   1.164 +%endif
   1.165 +; the loads above were issued before the pointer update, so the sums below still belong to the second row of this pair
   1.166 +        psadbw          mm1,       mm0
   1.167 +        psadbw          mm2,       mm0
   1.168 +        psadbw          mm3,       mm0
   1.169 +
   1.170 +        paddw           mm5,       mm1
   1.171 +        paddw           mm6,       mm2
   1.172 +        paddw           mm7,       mm3
   1.173 +%endmacro
   1.174 +; vp9_sad16x16x3_sse3: sum of absolute differences of a 16-wide, 16-row source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2; the three sums are written to results[0..2].
   1.175 +;void vp9_sad16x16x3_sse3(
   1.176 +;    unsigned char *src_ptr,
   1.177 +;    int  src_stride,
   1.178 +;    unsigned char *ref_ptr,
   1.179 +;    int  ref_stride,
   1.180 +;    int  *results)
   1.181 +global sym(vp9_sad16x16x3_sse3) PRIVATE
   1.182 +sym(vp9_sad16x16x3_sse3):
   1.183 +
   1.184 +    STACK_FRAME_CREATE_X3
   1.185 +; eight expansions of two rows each cover the 16 rows; stage 2 on the last one skips the pointer advance
   1.186 +        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 0-1, initialises xmm5-xmm7
   1.187 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 2-3
   1.188 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 4-5
   1.189 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 6-7
   1.190 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 8-9
   1.191 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 10-11
   1.192 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 12-13
   1.193 +        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 14-15
   1.194 +
   1.195 +        mov             rcx,        result_ptr                  ; rcx = results; rcx doubles as ref_stride / src_ptr / end_ptr depending on ABI, all unused from here on
   1.196 +; fold the two 64-bit halves of each accumulator; worst case 16 rows * 16 bytes * 255 = 65280 still fits the 16-bit lanes that paddw adds
   1.197 +        movq            xmm0,       xmm5                        ; low-half partial sum (upper half of xmm0 is cleared)
   1.198 +        psrldq          xmm5,       8                           ; move the high-half partial sum down
   1.199 +
   1.200 +        paddw           xmm0,       xmm5
   1.201 +        movd            [rcx],      xmm0                        ; results[0]: SAD against ref_ptr
   1.202 +;- same fold for the ref_ptr+1 accumulator
   1.203 +        movq            xmm0,       xmm6
   1.204 +        psrldq          xmm6,       8
   1.205 +
   1.206 +        paddw           xmm0,       xmm6
   1.207 +        movd            [rcx+4],    xmm0                        ; results[1]: SAD against ref_ptr+1
   1.208 +;- same fold for the ref_ptr+2 accumulator
   1.209 +        movq            xmm0,       xmm7
   1.210 +        psrldq          xmm7,       8
   1.211 +
   1.212 +        paddw           xmm0,       xmm7
   1.213 +        movd            [rcx+8],    xmm0                        ; results[2]: SAD against ref_ptr+2
   1.214 +
   1.215 +    STACK_FRAME_DESTROY_X3
   1.216 +; vp9_sad16x8x3_sse3: sum of absolute differences of a 16-wide, 8-row source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2; the three sums are written to results[0..2].
   1.217 +;void vp9_sad16x8x3_sse3(
   1.218 +;    unsigned char *src_ptr,
   1.219 +;    int  src_stride,
   1.220 +;    unsigned char *ref_ptr,
   1.221 +;    int  ref_stride,
   1.222 +;    int  *results)
   1.223 +global sym(vp9_sad16x8x3_sse3) PRIVATE
   1.224 +sym(vp9_sad16x8x3_sse3):
   1.225 +
   1.226 +    STACK_FRAME_CREATE_X3
   1.227 +; four expansions of two rows each cover the 8 rows; stage 2 on the last one skips the pointer advance
   1.228 +        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 0-1, initialises xmm5-xmm7
   1.229 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 2-3
   1.230 +        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 4-5
   1.231 +        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride      ; rows 6-7
   1.232 +
   1.233 +        mov             rcx,        result_ptr                  ; rcx = results; rcx doubles as ref_stride / src_ptr / end_ptr depending on ABI, all unused from here on
   1.234 +; fold the two 64-bit halves of each accumulator; worst case 8 rows * 16 bytes * 255 = 32640 fits the 16-bit lanes that paddw adds
   1.235 +        movq            xmm0,       xmm5                        ; low-half partial sum (upper half of xmm0 is cleared)
   1.236 +        psrldq          xmm5,       8                           ; move the high-half partial sum down
   1.237 +
   1.238 +        paddw           xmm0,       xmm5
   1.239 +        movd            [rcx],      xmm0                        ; results[0]: SAD against ref_ptr
   1.240 +;- same fold for the ref_ptr+1 accumulator
   1.241 +        movq            xmm0,       xmm6
   1.242 +        psrldq          xmm6,       8
   1.243 +
   1.244 +        paddw           xmm0,       xmm6
   1.245 +        movd            [rcx+4],    xmm0                        ; results[1]: SAD against ref_ptr+1
   1.246 +;- same fold for the ref_ptr+2 accumulator
   1.247 +        movq            xmm0,       xmm7
   1.248 +        psrldq          xmm7,       8
   1.249 +
   1.250 +        paddw           xmm0,       xmm7
   1.251 +        movd            [rcx+8],    xmm0                        ; results[2]: SAD against ref_ptr+2
   1.252 +
   1.253 +    STACK_FRAME_DESTROY_X3
   1.254 +; vp9_sad8x16x3_sse3: sum of absolute differences of an 8-wide, 16-row source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2, computed in MMX registers; the three sums are written to results[0..2].
   1.255 +;void vp9_sad8x16x3_sse3(
   1.256 +;    unsigned char *src_ptr,
   1.257 +;    int  src_stride,
   1.258 +;    unsigned char *ref_ptr,
   1.259 +;    int  ref_stride,
   1.260 +;    int  *results)
   1.261 +global sym(vp9_sad8x16x3_sse3) PRIVATE
   1.262 +sym(vp9_sad8x16x3_sse3):
   1.263 +
   1.264 +    STACK_FRAME_CREATE_X3
   1.265 +; eight expansions of two rows each cover the 16 rows; stage 2 on the last one skips the pointer advance
   1.266 +        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 0-1, initialises mm5-mm7
   1.267 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 2-3
   1.268 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 4-5
   1.269 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 6-7
   1.270 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 8-9
   1.271 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 10-11
   1.272 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 12-13
   1.273 +        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 14-15
   1.274 +
   1.275 +        mov             rcx,        result_ptr                  ; rcx = results; rcx doubles as ref_stride / src_ptr / end_ptr depending on ABI, all unused from here on
   1.276 +
   1.277 +        punpckldq       mm5,        mm6                         ; pack the offset-0 and offset-1 sums into one qword as two dwords
   1.278 +
   1.279 +        movq            [rcx],      mm5                         ; results[0] and results[1] in a single store
   1.280 +        movd            [rcx+8],    mm7                         ; results[2]: SAD against ref_ptr+2
   1.281 +; NOTE(review): MMX registers were written and no emms is executed before returning - confirm callers reset the x87/MMX state
   1.282 +    STACK_FRAME_DESTROY_X3
   1.283 +; vp9_sad8x8x3_sse3: sum of absolute differences of an 8-wide, 8-row source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2, computed in MMX registers; the three sums are written to results[0..2].
   1.284 +;void vp9_sad8x8x3_sse3(
   1.285 +;    unsigned char *src_ptr,
   1.286 +;    int  src_stride,
   1.287 +;    unsigned char *ref_ptr,
   1.288 +;    int  ref_stride,
   1.289 +;    int  *results)
   1.290 +global sym(vp9_sad8x8x3_sse3) PRIVATE
   1.291 +sym(vp9_sad8x8x3_sse3):
   1.292 +
   1.293 +    STACK_FRAME_CREATE_X3
   1.294 +; four expansions of two rows each cover the 8 rows; stage 2 on the last one skips the pointer advance
   1.295 +        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 0-1, initialises mm5-mm7
   1.296 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 2-3
   1.297 +        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 4-5
   1.298 +        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride       ; rows 6-7
   1.299 +
   1.300 +        mov             rcx,        result_ptr                  ; rcx = results; rcx doubles as ref_stride / src_ptr / end_ptr depending on ABI, all unused from here on
   1.301 +
   1.302 +        punpckldq       mm5,        mm6                         ; pack the offset-0 and offset-1 sums into one qword as two dwords
   1.303 +
   1.304 +        movq            [rcx],      mm5                         ; results[0] and results[1] in a single store
   1.305 +        movd            [rcx+8],    mm7                         ; results[2]: SAD against ref_ptr+2
   1.306 +; NOTE(review): MMX registers were written and no emms is executed before returning - confirm callers reset the x87/MMX state
   1.307 +    STACK_FRAME_DESTROY_X3
   1.308 +; vp9_sad4x4x3_sse3: sum of absolute differences of a 4x4 source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2. Two 4-byte rows are byte-interleaved into one MMX register so a single psadbw covers both rows.
   1.309 +;void vp9_sad4x4x3_sse3(
   1.310 +;    unsigned char *src_ptr,
   1.311 +;    int  src_stride,
   1.312 +;    unsigned char *ref_ptr,
   1.313 +;    int  ref_stride,
   1.314 +;    int  *results)
   1.315 +global sym(vp9_sad4x4x3_sse3) PRIVATE
   1.316 +sym(vp9_sad4x4x3_sse3):
   1.317 +
   1.318 +    STACK_FRAME_CREATE_X3
   1.319 +; ---- rows 0 and 1 ----
   1.320 +        movd            mm0,        DWORD PTR [src_ptr]                 ; source row 0
   1.321 +        movd            mm1,        DWORD PTR [ref_ptr]                 ; reference row 0, offset 0
   1.322 +
   1.323 +        movd            mm2,        DWORD PTR [src_ptr+src_stride]      ; source row 1
   1.324 +        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]      ; reference row 1, offset 0
   1.325 +
   1.326 +        punpcklbw       mm0,        mm2                                 ; mm0 = source rows 0 and 1, byte-interleaved (8 bytes)
   1.327 +        punpcklbw       mm1,        mm3                                 ; reference rows interleaved the same way, so psadbw pairs matching pixels
   1.328 +
   1.329 +        movd            mm4,        DWORD PTR [ref_ptr+1]               ; reference row 0, offset 1
   1.330 +        movd            mm5,        DWORD PTR [ref_ptr+2]               ; reference row 0, offset 2
   1.331 +
   1.332 +        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]    ; reference row 1, offset 1
   1.333 +        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]    ; reference row 1, offset 2
   1.334 +
   1.335 +        psadbw          mm1,        mm0                                 ; mm1 = SAD of rows 0-1 at offset 0
   1.336 +
   1.337 +        punpcklbw       mm4,        mm2
   1.338 +        punpcklbw       mm5,        mm3
   1.339 +
   1.340 +        psadbw          mm4,        mm0                                 ; mm4 = SAD of rows 0-1 at offset 1
   1.341 +        psadbw          mm5,        mm0                                 ; mm5 = SAD of rows 0-1 at offset 2
   1.342 +; ---- rows 2 and 3: step both pointers down two rows and repeat ----
   1.343 +        lea             src_ptr,    [src_ptr+src_stride*2]
   1.344 +        lea             ref_ptr,    [ref_ptr+ref_stride*2]
   1.345 +
   1.346 +        movd            mm0,        DWORD PTR [src_ptr]                 ; source row 2
   1.347 +        movd            mm2,        DWORD PTR [ref_ptr]                 ; reference row 2, offset 0
   1.348 +
   1.349 +        movd            mm3,        DWORD PTR [src_ptr+src_stride]      ; source row 3
   1.350 +        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]      ; reference row 3, offset 0
   1.351 +
   1.352 +        punpcklbw       mm0,        mm3                                 ; mm0 = source rows 2 and 3, byte-interleaved
   1.353 +        punpcklbw       mm2,        mm6
   1.354 +
   1.355 +        movd            mm3,        DWORD PTR [ref_ptr+1]               ; reference row 2, offset 1
   1.356 +        movd            mm7,        DWORD PTR [ref_ptr+2]               ; reference row 2, offset 2
   1.357 +
   1.358 +        psadbw          mm2,        mm0                                 ; SAD of rows 2-3 at offset 0
   1.359 +
   1.360 +        paddw           mm1,        mm2                                 ; mm1 = full 4x4 SAD at offset 0
   1.361 +
   1.362 +        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]    ; reference row 3, offset 1
   1.363 +        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]    ; reference row 3, offset 2
   1.364 +
   1.365 +        punpcklbw       mm3,        mm2
   1.366 +        punpcklbw       mm7,        mm6
   1.367 +
   1.368 +        psadbw          mm3,        mm0                                 ; SAD of rows 2-3 at offset 1
   1.369 +        psadbw          mm7,        mm0                                 ; SAD of rows 2-3 at offset 2
   1.370 +
   1.371 +        paddw           mm3,        mm4                                 ; mm3 = full 4x4 SAD at offset 1
   1.372 +        paddw           mm7,        mm5                                 ; mm7 = full 4x4 SAD at offset 2
   1.373 +
   1.374 +        mov             rcx,        result_ptr                          ; rcx = results; rcx doubles as ref_stride / src_ptr / end_ptr depending on ABI, all unused from here on
   1.375 +
   1.376 +        punpckldq       mm1,        mm3                                 ; pack the offset-0 and offset-1 sums into one qword as two dwords
   1.377 +
   1.378 +        movq            [rcx],      mm1                                 ; results[0] and results[1] in a single store
   1.379 +        movd            [rcx+8],    mm7                                 ; results[2]: SAD against ref_ptr+2
   1.380 +; NOTE(review): MMX registers were written and no emms is executed before returning - confirm callers reset the x87/MMX state
   1.381 +    STACK_FRAME_DESTROY_X3

mercurial