media/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/encoder/x86/vp9_sad_ssse3.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,370 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +; PROCESS_16X2X3: SADs of two 16-byte source rows against reference rows read at byte offsets +0, +1 and +2.
    1.17 +%macro PROCESS_16X2X3 1           ; arg 1: nonzero = start new sums in xmm5-xmm7, 0 = add to the existing sums
    1.18 +%if %1
    1.19 +        movdqa          xmm0,       XMMWORD PTR [rsi]        ; source row; movdqa requires a 16-byte aligned address
    1.20 +        lddqu           xmm5,       XMMWORD PTR [rdi]        ; unaligned reference loads at +0, +1, +2
    1.21 +        lddqu           xmm6,       XMMWORD PTR [rdi+1]
    1.22 +        lddqu           xmm7,       XMMWORD PTR [rdi+2]
    1.23 +        ; first row pair: the SADs are computed in place, so they become the initial sums
    1.24 +        psadbw          xmm5,       xmm0
    1.25 +        psadbw          xmm6,       xmm0
    1.26 +        psadbw          xmm7,       xmm0
    1.27 +%else
    1.28 +        movdqa          xmm0,       XMMWORD PTR [rsi]
    1.29 +        lddqu           xmm1,       XMMWORD PTR [rdi]
    1.30 +        lddqu           xmm2,       XMMWORD PTR [rdi+1]
    1.31 +        lddqu           xmm3,       XMMWORD PTR [rdi+2]
    1.32 +        ; later row pairs: compute the SADs in the temporaries xmm1-xmm3 ...
    1.33 +        psadbw          xmm1,       xmm0
    1.34 +        psadbw          xmm2,       xmm0
    1.35 +        psadbw          xmm3,       xmm0
    1.36 +        ; ... and add them to the running sums: xmm5 (offset +0), xmm6 (+1), xmm7 (+2)
    1.37 +        paddw           xmm5,       xmm1
    1.38 +        paddw           xmm6,       xmm2
    1.39 +        paddw           xmm7,       xmm3
    1.40 +%endif
    1.41 +        movdqa          xmm0,       XMMWORD PTR [rsi+rax]    ; second source row; callers in this file load rax = src_stride
    1.42 +        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]    ; second reference row; callers load rdx = ref_stride
    1.43 +        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
    1.44 +        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
    1.45 +        ; advance both pointers by two rows, ready for the next invocation
    1.46 +        lea             rsi,        [rsi+rax*2]
    1.47 +        lea             rdi,        [rdi+rdx*2]
    1.48 +
    1.49 +        psadbw          xmm1,       xmm0
    1.50 +        psadbw          xmm2,       xmm0
    1.51 +        psadbw          xmm3,       xmm0
    1.52 +
    1.53 +        paddw           xmm5,       xmm1
    1.54 +        paddw           xmm6,       xmm2
    1.55 +        paddw           xmm7,       xmm3
    1.56 +%endmacro
    1.57 +; PROCESS_16X2X3_OFFSET: the same three SADs for two rows, with the reference read by aligned loads and shifted by palignr.
    1.58 +%macro PROCESS_16X2X3_OFFSET 2    ; arg 1: nonzero = start new sums; arg 2: byte offset of the reference row from rdi
    1.59 +%if %1
    1.60 +        movdqa          xmm0,       XMMWORD PTR [rsi]        ; source row
    1.61 +        movdqa          xmm4,       XMMWORD PTR [rdi]        ; low 16 bytes of the 32-byte reference window
    1.62 +        movdqa          xmm7,       XMMWORD PTR [rdi+16]     ; high 16 bytes of the window
    1.63 +        ; palignr dst, src, n: join dst (high) with src (low), shift right n bytes, keep the low 16 bytes
    1.64 +        movdqa          xmm5,       xmm7
    1.65 +        palignr         xmm5,       xmm4,       %2
    1.66 +
    1.67 +        movdqa          xmm6,       xmm7
    1.68 +        palignr         xmm6,       xmm4,       (%2+1)
    1.69 +
    1.70 +        palignr         xmm7,       xmm4,       (%2+2)
    1.71 +        ; first row pair: the SADs are computed in place, so they become the initial sums
    1.72 +        psadbw          xmm5,       xmm0
    1.73 +        psadbw          xmm6,       xmm0
    1.74 +        psadbw          xmm7,       xmm0
    1.75 +%else
    1.76 +        movdqa          xmm0,       XMMWORD PTR [rsi]
    1.77 +        movdqa          xmm4,       XMMWORD PTR [rdi]
    1.78 +        movdqa          xmm3,       XMMWORD PTR [rdi+16]
    1.79 +        ; later row pairs: build the three shifted rows in the temporaries xmm1-xmm3
    1.80 +        movdqa          xmm1,       xmm3
    1.81 +        palignr         xmm1,       xmm4,       %2
    1.82 +
    1.83 +        movdqa          xmm2,       xmm3
    1.84 +        palignr         xmm2,       xmm4,       (%2+1)
    1.85 +
    1.86 +        palignr         xmm3,       xmm4,       (%2+2)
    1.87 +
    1.88 +        psadbw          xmm1,       xmm0
    1.89 +        psadbw          xmm2,       xmm0
    1.90 +        psadbw          xmm3,       xmm0
    1.91 +        ; add to the running sums: xmm5 (offset +0), xmm6 (+1), xmm7 (+2)
    1.92 +        paddw           xmm5,       xmm1
    1.93 +        paddw           xmm6,       xmm2
    1.94 +        paddw           xmm7,       xmm3
    1.95 +%endif
    1.96 +        movdqa          xmm0,       XMMWORD PTR [rsi+rax]    ; second source row; callers in this file load rax = src_stride
    1.97 +        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]    ; second reference row; callers load rdx = ref_stride
    1.98 +        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
    1.99 +
   1.100 +        movdqa          xmm1,       xmm3
   1.101 +        palignr         xmm1,       xmm4,       %2
   1.102 +
   1.103 +        movdqa          xmm2,       xmm3
   1.104 +        palignr         xmm2,       xmm4,       (%2+1)
   1.105 +
   1.106 +        palignr         xmm3,       xmm4,       (%2+2)
   1.107 +        ; advance both pointers by two rows, ready for the next invocation
   1.108 +        lea             rsi,        [rsi+rax*2]
   1.109 +        lea             rdi,        [rdi+rdx*2]
   1.110 +
   1.111 +        psadbw          xmm1,       xmm0
   1.112 +        psadbw          xmm2,       xmm0
   1.113 +        psadbw          xmm3,       xmm0
   1.114 +
   1.115 +        paddw           xmm5,       xmm1
   1.116 +        paddw           xmm6,       xmm2
   1.117 +        paddw           xmm7,       xmm3
   1.118 +%endmacro
   1.119 +; PROCESS_16X16X3_OFFSET: emits the 16-row body for one reference misalignment, under a label built from its arguments.
   1.120 +%macro PROCESS_16X16X3_OFFSET 2   ; arg 1: misalignment in bytes (invoked with 0..14 in this file); arg 2: label prefix
   1.121 +%2_aligned_by_%1:
   1.122 +        ; the caller dispatches here on (rdi & 15), so removing the misalignment leaves rdi 16-byte aligned
   1.123 +        sub             rdi,        %1
   1.124 +        ; 8 invocations x 2 rows = 16 rows; only the first invocation initialises the sums
   1.125 +        PROCESS_16X2X3_OFFSET 1, %1
   1.126 +        PROCESS_16X2X3_OFFSET 0, %1
   1.127 +        PROCESS_16X2X3_OFFSET 0, %1
   1.128 +        PROCESS_16X2X3_OFFSET 0, %1
   1.129 +        PROCESS_16X2X3_OFFSET 0, %1
   1.130 +        PROCESS_16X2X3_OFFSET 0, %1
   1.131 +        PROCESS_16X2X3_OFFSET 0, %1
   1.132 +        PROCESS_16X2X3_OFFSET 0, %1
   1.133 +        ; continue at the shared reduce-and-store code, which the using function must define
   1.134 +        jmp             %2_store_off
   1.135 +
   1.136 +%endmacro
   1.137 +; PROCESS_16X8X3_OFFSET: emits the 8-row body for one reference misalignment, under a label built from its arguments.
   1.138 +%macro PROCESS_16X8X3_OFFSET 2    ; arg 1: misalignment in bytes (invoked with 0..14 in this file); arg 2: label prefix
   1.139 +%2_aligned_by_%1:
   1.140 +        ; the caller dispatches here on (rdi & 15), so removing the misalignment leaves rdi 16-byte aligned
   1.141 +        sub             rdi,        %1
   1.142 +        ; 4 invocations x 2 rows = 8 rows; only the first invocation initialises the sums
   1.143 +        PROCESS_16X2X3_OFFSET 1, %1
   1.144 +        PROCESS_16X2X3_OFFSET 0, %1
   1.145 +        PROCESS_16X2X3_OFFSET 0, %1
   1.146 +        PROCESS_16X2X3_OFFSET 0, %1
   1.147 +        ; continue at the shared reduce-and-store code, which the using function must define
   1.148 +        jmp             %2_store_off
   1.149 +
   1.150 +%endmacro
   1.151 +; Three 16x16 sums of absolute differences: the source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2.
   1.152 +;void vp9_sad16x16x3_ssse3(
   1.153 +;    unsigned char *src_ptr,      ; read with movdqa, so each source row must be 16-byte aligned
   1.154 +;    int  src_stride,
   1.155 +;    unsigned char *ref_ptr,      ; any alignment; the low 4 address bits select the code path
   1.156 +;    int  ref_stride,
   1.157 +;    int  *results)               ; results[0..2] receive the sums for reference offsets +0, +1, +2
   1.158 +global sym(vp9_sad16x16x3_ssse3) PRIVATE
   1.159 +sym(vp9_sad16x16x3_ssse3):
   1.160 +    push        rbp
   1.161 +    mov         rbp, rsp
   1.162 +    SHADOW_ARGS_TO_STACK 5
   1.163 +    SAVE_XMM 7
   1.164 +    push        rsi
   1.165 +    push        rdi
   1.166 +    push        rcx
   1.167 +    ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM and arg() come from x86_abi_support.asm, not shown here)
   1.168 +
   1.169 +        mov             rsi,        arg(0) ;src_ptr
   1.170 +        mov             rdi,        arg(2) ;ref_ptr
   1.171 +        ; rdx = ref_ptr & 15: the misalignment of the reference pointer, used as the jump-table index
   1.172 +        mov             rdx,        0xf
   1.173 +        and             rdx,        rdi
   1.174 +        ; the table is data placed inline with the code, so execution has to jump over it
   1.175 +        jmp .vp9_sad16x16x3_ssse3_skiptable
   1.176 +.vp9_sad16x16x3_ssse3_jumptable:
   1.177 +        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
   1.178 +        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
   1.179 +        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
   1.180 +        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
   1.181 +        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
   1.182 +        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
   1.183 +        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
   1.184 +        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
   1.185 +        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
   1.186 +        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
   1.187 +        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
   1.188 +        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
   1.189 +        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
   1.190 +        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
   1.191 +        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
   1.192 +        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
   1.193 +.vp9_sad16x16x3_ssse3_skiptable:
   1.194 +        ; call pushes the address of do_jump as its return address; the pop reads it back (no absolute address needed)
   1.195 +        call .vp9_sad16x16x3_ssse3_do_jump
   1.196 +.vp9_sad16x16x3_ssse3_do_jump:
   1.197 +        pop             rcx                         ; get the address of do_jump
   1.198 +        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
   1.199 +        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable
   1.200 +
   1.201 +        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
   1.202 +        add             rcx,        rax             ; rcx = do_jump + offset = address of the selected body
   1.203 +        ; rax and rdx were scratch above; from here on they hold the strides the PROCESS_* macros expect
   1.204 +        movsxd          rax,        dword ptr arg(1) ;src_stride
   1.205 +        movsxd          rdx,        dword ptr arg(3) ;ref_stride
   1.206 +
   1.207 +        jmp             rcx
   1.208 +        ; one unrolled body per misalignment 0..14; each defines its own label and ends with a jump to store_off
   1.209 +        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
   1.210 +        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
   1.211 +        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
   1.212 +        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
   1.213 +        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
   1.214 +        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
   1.215 +        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
   1.216 +        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
   1.217 +        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
   1.218 +        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
   1.219 +        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
   1.220 +        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
   1.221 +        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
   1.222 +        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
   1.223 +        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3
   1.224 +        ; misalignment 15: the +2 row would need a byte past the 32 the palignr path loads, so use unaligned loads
   1.225 +.vp9_sad16x16x3_ssse3_aligned_by_15:
   1.226 +        PROCESS_16X2X3 1
   1.227 +        PROCESS_16X2X3 0
   1.228 +        PROCESS_16X2X3 0
   1.229 +        PROCESS_16X2X3 0
   1.230 +        PROCESS_16X2X3 0
   1.231 +        PROCESS_16X2X3 0
   1.232 +        PROCESS_16X2X3 0
   1.233 +        PROCESS_16X2X3 0
   1.234 +        ; the misalignment-15 path falls through into the shared store code
   1.235 +.vp9_sad16x16x3_ssse3_store_off:
   1.236 +        mov             rdi,        arg(4) ;Results
   1.237 +        ; psadbw leaves one partial sum per 64-bit half; add the halves (at most 16*16*255 = 65280, fits 16 bits)
   1.238 +        movq            xmm0,       xmm5            ; low half of the offset +0 sums
   1.239 +        psrldq          xmm5,       8               ; bring the high half down
   1.240 +
   1.241 +        paddw           xmm0,       xmm5
   1.242 +        movd            [rdi],      xmm0            ; results[0] (movd writes the low 32 bits)
   1.243 +; results[1]: reference offset +1
   1.244 +        movq            xmm0,       xmm6
   1.245 +        psrldq          xmm6,       8
   1.246 +
   1.247 +        paddw           xmm0,       xmm6
   1.248 +        movd            [rdi+4],    xmm0
   1.249 +; results[2]: reference offset +2
   1.250 +        movq            xmm0,       xmm7
   1.251 +        psrldq          xmm7,       8
   1.252 +
   1.253 +        paddw           xmm0,       xmm7
   1.254 +        movd            [rdi+8],    xmm0
   1.255 +
   1.256 +    ; begin epilog (restores in the reverse order of the prolog)
   1.257 +    pop         rcx
   1.258 +    pop         rdi
   1.259 +    pop         rsi
   1.260 +    RESTORE_XMM
   1.261 +    UNSHADOW_ARGS
   1.262 +    pop         rbp
   1.263 +    ret
   1.264 +; Three 16x8 sums of absolute differences: the source block against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2.
   1.265 +;void vp9_sad16x8x3_ssse3(
   1.266 +;    unsigned char *src_ptr,      ; read with movdqa, so each source row must be 16-byte aligned
   1.267 +;    int  src_stride,
   1.268 +;    unsigned char *ref_ptr,      ; any alignment; the low 4 address bits select the code path
   1.269 +;    int  ref_stride,
   1.270 +;    int  *results)               ; results[0..2] receive the sums for reference offsets +0, +1, +2
   1.271 +global sym(vp9_sad16x8x3_ssse3) PRIVATE
   1.272 +sym(vp9_sad16x8x3_ssse3):
   1.273 +    push        rbp
   1.274 +    mov         rbp, rsp
   1.275 +    SHADOW_ARGS_TO_STACK 5
   1.276 +    SAVE_XMM 7
   1.277 +    push        rsi
   1.278 +    push        rdi
   1.279 +    push        rcx
   1.280 +    ; end prolog (SHADOW_ARGS_TO_STACK, SAVE_XMM and arg() come from x86_abi_support.asm, not shown here)
   1.281 +
   1.282 +        mov             rsi,        arg(0) ;src_ptr
   1.283 +        mov             rdi,        arg(2) ;ref_ptr
   1.284 +        ; rdx = ref_ptr & 15: the misalignment of the reference pointer, used as the jump-table index
   1.285 +        mov             rdx,        0xf
   1.286 +        and             rdx,        rdi
   1.287 +        ; the table is data placed inline with the code, so execution has to jump over it
   1.288 +        jmp .vp9_sad16x8x3_ssse3_skiptable
   1.289 +.vp9_sad16x8x3_ssse3_jumptable:
   1.290 +        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
   1.291 +        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
   1.292 +        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
   1.293 +        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
   1.294 +        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
   1.295 +        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
   1.296 +        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
   1.297 +        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
   1.298 +        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
   1.299 +        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
   1.300 +        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
   1.301 +        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
   1.302 +        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
   1.303 +        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
   1.304 +        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
   1.305 +        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
   1.306 +.vp9_sad16x8x3_ssse3_skiptable:
   1.307 +        ; call pushes the address of do_jump as its return address; the pop reads it back (no absolute address needed)
   1.308 +        call .vp9_sad16x8x3_ssse3_do_jump
   1.309 +.vp9_sad16x8x3_ssse3_do_jump:
   1.310 +        pop             rcx                         ; get the address of do_jump
   1.311 +        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
   1.312 +        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable
   1.313 +
   1.314 +        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
   1.315 +        add             rcx,        rax             ; rcx = do_jump + offset = address of the selected body
   1.316 +        ; rax and rdx were scratch above; from here on they hold the strides the PROCESS_* macros expect
   1.317 +        movsxd          rax,        dword ptr arg(1) ;src_stride
   1.318 +        movsxd          rdx,        dword ptr arg(3) ;ref_stride
   1.319 +
   1.320 +        jmp             rcx
   1.321 +        ; one unrolled body per misalignment 0..14; each defines its own label and ends with a jump to store_off
   1.322 +        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
   1.323 +        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
   1.324 +        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
   1.325 +        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
   1.326 +        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
   1.327 +        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
   1.328 +        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
   1.329 +        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
   1.330 +        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
   1.331 +        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
   1.332 +        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
   1.333 +        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
   1.334 +        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
   1.335 +        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
   1.336 +        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3
   1.337 +        ; misalignment 15: the +2 row would need a byte past the 32 the palignr path loads, so use unaligned loads
   1.338 +.vp9_sad16x8x3_ssse3_aligned_by_15:
   1.339 +        ; 4 invocations x 2 rows = 8 rows; only the first invocation initialises the sums
   1.340 +        PROCESS_16X2X3 1
   1.341 +        PROCESS_16X2X3 0
   1.342 +        PROCESS_16X2X3 0
   1.343 +        PROCESS_16X2X3 0
   1.344 +        ; the misalignment-15 path falls through into the shared store code
   1.345 +.vp9_sad16x8x3_ssse3_store_off:
   1.346 +        mov             rdi,        arg(4) ;Results
   1.347 +        ; psadbw leaves one partial sum per 64-bit half; add the halves (at most 16*8*255 = 32640, fits 16 bits)
   1.348 +        movq            xmm0,       xmm5            ; low half of the offset +0 sums
   1.349 +        psrldq          xmm5,       8               ; bring the high half down
   1.350 +
   1.351 +        paddw           xmm0,       xmm5
   1.352 +        movd            [rdi],      xmm0            ; results[0] (movd writes the low 32 bits)
   1.353 +; results[1]: reference offset +1
   1.354 +        movq            xmm0,       xmm6
   1.355 +        psrldq          xmm6,       8
   1.356 +
   1.357 +        paddw           xmm0,       xmm6
   1.358 +        movd            [rdi+4],    xmm0
   1.359 +; results[2]: reference offset +2
   1.360 +        movq            xmm0,       xmm7
   1.361 +        psrldq          xmm7,       8
   1.362 +
   1.363 +        paddw           xmm0,       xmm7
   1.364 +        movd            [rdi+8],    xmm0
   1.365 +
   1.366 +    ; begin epilog (restores in the reverse order of the prolog)
   1.367 +    pop         rcx
   1.368 +    pop         rdi
   1.369 +    pop         rsi
   1.370 +    RESTORE_XMM
   1.371 +    UNSHADOW_ARGS
   1.372 +    pop         rbp
   1.373 +    ret

mercurial