media/libvpx/vp8/encoder/x86/fwalsh_sse2.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
       ;
       ; 4x4 Walsh transform on 16-bit samples, done in two passes with SSE2.
       ;   input  : first of four rows; 8 bytes (four words) are read per row
       ;   output : 32 bytes of results, stored with movdqa, so the buffer
       ;            has to be 16-byte aligned
       ;   pitch  : distance between consecutive input rows, in bytes
       ; Uses xmm0-xmm7, rsi, rdi and rdx.
       ; Lane comments list elements from the most significant lane down to
       ; the least significant one.
    15 global sym(vp8_short_walsh4x4_sse2) PRIVATE
    16 sym(vp8_short_walsh4x4_sse2):
    17     push        rbp
    18     mov         rbp, rsp
           ; prolog helpers come from x86_abi_support.asm; the epilog undoes
           ; them in reverse order
    19     SHADOW_ARGS_TO_STACK 3
    20     SAVE_XMM 7
    21     GET_GOT     rbx
    22     push        rsi
    23     push        rdi
    24     ; end prolog
    26     mov     rsi, arg(0)           ; input
    27     mov     rdi, arg(1)           ; output
    28     movsxd  rdx, dword ptr arg(2) ; pitch, sign-extended to pointer width
    30     ; first for loop
           ; For each input row, with ip[n] the n-th word of that row:
           ;   a1 = (ip[0] + ip[2]) << 2      d1 = (ip[1] + ip[3]) << 2
           ;   b1 = (ip[0] - ip[2]) << 2      c1 = (ip[1] - ip[3]) << 2
           ;   row result = a1+d1+(a1!=0), b1+c1, b1-c1, a1-d1
           ; All four rows are processed together, one row per 16-bit lane.
    31     movq    xmm0, MMWORD PTR [rsi]           ; load input
    32     movq    xmm1, MMWORD PTR [rsi + rdx]
    33     lea     rsi,  [rsi + rdx*2]
    34     movq    xmm2, MMWORD PTR [rsi]
    35     movq    xmm3, MMWORD PTR [rsi + rdx]
           ; transpose: afterwards each 64-bit half holds ip[n] of all four rows
    37     punpcklwd xmm0,  xmm1
    38     punpcklwd xmm2,  xmm3
    40     movdqa    xmm1, xmm0
    41     punpckldq xmm0, xmm2           ; ip[1] ip[0]
    42     punpckhdq xmm1, xmm2           ; ip[3] ip[2]
    44     movdqa    xmm2, xmm0
    45     paddw     xmm0, xmm1
    46     psubw     xmm2, xmm1
    48     psllw     xmm0, 2              ; d1  a1
    49     psllw     xmm2, 2              ; c1  b1
    51     movdqa    xmm1, xmm0
    52     punpcklqdq xmm0, xmm2          ; b1  a1
    53     punpckhqdq xmm1, xmm2          ; c1  d1
           ; xmm7 = (a1 != 0) as 0/1 words in the low half, 0 in the high half
    56     movq      xmm6, xmm0           ; 0  a1 (reg-to-reg movq clears the upper quadword)
    57     pxor      xmm7, xmm7
    58     pcmpeqw   xmm7, xmm6           ; 0xffff in every lane whose word is zero
    59     paddw     xmm7, [GLOBAL(c1)]   ; 0xffff + 1 wraps to 0, 0 + 1 gives 1
    61     movdqa    xmm2, xmm0
    62     paddw     xmm0, xmm1           ; b1+c1  a1+d1
    63     psubw     xmm2, xmm1           ; b1-c1  a1-d1
    64     paddw     xmm0, xmm7           ; b1+c1  a1+d1+(a1!=0)
    66     ; second for loop
           ; Works down each column of the first-pass result; the numbers
           ; below are word indices into that 4x4 intermediate block.
    67     ; input: 13  9  5  1 12  8  4  0 (xmm0)
    68     ;        14 10  6  2 15 11  7  3 (xmm2)
    69     ; after shuffle:
    70     ;        13  5  9  1 12  4  8  0 (xmm0)
    71     ;        14  6 10  2 15  7 11  3 (xmm1)
    72     pshuflw   xmm3, xmm0, 0xd8
    73     pshufhw   xmm0, xmm3, 0xd8
    74     pshuflw   xmm3, xmm2, 0xd8
    75     pshufhw   xmm1, xmm3, 0xd8
           ; pmaddwd against c1 (+1,+1) adds each adjacent word pair and
           ; against cn1 (+1,-1) subtracts it, widening the result to 32 bits
    77     movdqa    xmm2, xmm0
    78     pmaddwd   xmm0, [GLOBAL(c1)]    ; d11 a11 d10 a10
    79     pmaddwd   xmm2, [GLOBAL(cn1)]   ; c11 b11 c10 b10
    80     movdqa    xmm3, xmm1
    81     pmaddwd   xmm1, [GLOBAL(c1)]    ; d12 a12 d13 a13
    82     pmaddwd   xmm3, [GLOBAL(cn1)]   ; c12 b12 c13 b13
           ; regroup so equal letters sit next to each other, columns ascending
    84     pshufd    xmm4, xmm0, 0xd8      ; d11 d10 a11 a10
    85     pshufd    xmm5, xmm2, 0xd8      ; c11 c10 b11 b10
    86     pshufd    xmm6, xmm1, 0x72      ; d13 d12 a13 a12
    87     pshufd    xmm7, xmm3, 0x72      ; c13 c12 b13 b12
    89     movdqa    xmm0, xmm4
    90     punpcklqdq xmm0, xmm5           ; b11 b10 a11 a10
    91     punpckhqdq xmm4, xmm5           ; c11 c10 d11 d10
    92     movdqa    xmm1, xmm6
    93     punpcklqdq xmm1, xmm7           ; b13 b12 a13 a12
    94     punpckhqdq xmm6, xmm7           ; c13 c12 d13 d12
           ; a2 = a1 + d1, b2 = b1 + c1, c2 = b1 - c1, d2 = a1 - d1
    96     movdqa    xmm2, xmm0
    97     paddd     xmm0, xmm4            ; b21 b20 a21 a20
    98     psubd     xmm2, xmm4            ; c21 c20 d21 d20
    99     movdqa    xmm3, xmm1
   100     paddd     xmm1, xmm6            ; b23 b22 a23 a22
   101     psubd     xmm3, xmm6            ; c23 c22 d23 d22
           ; xmm4..xmm7 = 1 in every dword lane whose result is negative
   103     pxor      xmm4, xmm4
   104     movdqa    xmm5, xmm4
   105     pcmpgtd   xmm4, xmm0            ; all ones where 0 > value
   106     pcmpgtd   xmm5, xmm2
   107     pand      xmm4, [GLOBAL(cd1)]   ; reduce the mask to 0 / 1
   108     pand      xmm5, [GLOBAL(cd1)]
   110     pxor      xmm6, xmm6
   111     movdqa    xmm7, xmm6
   112     pcmpgtd   xmm6, xmm1
   113     pcmpgtd   xmm7, xmm3
   114     pand      xmm6, [GLOBAL(cd1)]
   115     pand      xmm7, [GLOBAL(cd1)]
           ; result = (x + (x < 0) + 3) >> 3, using an arithmetic shift
   117     paddd     xmm0, xmm4
   118     paddd     xmm2, xmm5
   119     paddd     xmm0, [GLOBAL(cd3)]
   120     paddd     xmm2, [GLOBAL(cd3)]
   121     paddd     xmm1, xmm6
   122     paddd     xmm3, xmm7
   123     paddd     xmm1, [GLOBAL(cd3)]
   124     paddd     xmm3, [GLOBAL(cd3)]
   126     psrad     xmm0, 3
   127     psrad     xmm1, 3
   128     psrad     xmm2, 3
   129     psrad     xmm3, 3
   130     movdqa    xmm4, xmm0
   131     punpcklqdq xmm0, xmm1           ; a23 a22 a21 a20
   132     punpckhqdq xmm4, xmm1           ; b23 b22 b21 b20
   133     movdqa    xmm5, xmm2
   134     punpckhqdq xmm2, xmm3           ; c23 c22 c21 c20
   135     punpcklqdq xmm5, xmm3           ; d23 d22 d21 d20
           ; narrow the dwords back to words with signed saturation
   137     packssdw  xmm0, xmm4            ; b23 b22 b21 b20 a23 a22 a21 a20
   138     packssdw  xmm2, xmm5            ; d23 d22 d21 d20 c23 c22 c21 c20
           ; aligned 16-byte stores: a and b rows first, then c and d rows
   140     movdqa  XMMWORD PTR [rdi], xmm0
   141     movdqa  XMMWORD PTR [rdi + 16], xmm2
   143     ; begin epilog
   144     pop rdi
   145     pop rsi
   146     RESTORE_GOT
   147     RESTORE_XMM
   148     UNSHADOW_ARGS
   149     pop         rbp
   150     ret
   152 SECTION_RODATA
       ; 16-byte constant vectors referenced through GLOBAL() as memory
       ; operands; each one is aligned for use with SSE2 instructions.
   153 align 16
   154 c1:                                 ; eight words, all +1
   155     times 8 dw 1
   156 align 16
   157 cn1:                                ; four (+1, -1) word pairs
   158     times 4 dw 1, -1
   159 align 16
   160 cd1:                                ; four dwords, all 1
   161     times 4 dd 1
   162 align 16
   163 cd3:                                ; four dwords, all 3
   164     times 4 dd 3

mercurial