media/libvpx/vp8/common/x86/iwalsh_mmx.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/common/x86/iwalsh_mmx.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,140 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +;void vp8_short_inv_walsh4x4_mmx(short *input, short *output) -- two-pass 4x4 add/subtract butterfly over the 16 words at input; each result is (x + 3) >> 3 and is stored as one word every 32 bytes from output
    1.18 +global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
    1.19 +sym(vp8_short_inv_walsh4x4_mmx):
    1.20 +    push        rbp               ; standard frame: save caller's rbp
    1.21 +    mov         rbp, rsp          ; rbp = frame base for the rest of the routine
    1.22 +    SHADOW_ARGS_TO_STACK 2        ; macro from x86_abi_support.asm; presumably makes the 2 args reachable through arg(n) -- confirm
    1.23 +    ; end prolog
    1.24 +    ; register roles: rdx = input pointer, later reused as output pointer; mm7 = rounding bias; eax/ecx = scratch for the final stores
    1.25 +    mov         rdx, arg(0)       ; rdx = input
    1.26 +    mov         rax, 30003h       ; two 16-bit copies of the rounding constant 3
    1.27 +    ; load the 4 quadwords (4 words each) of the input block; the mm7 setup is interleaved with the loads
    1.28 +    movq        mm0, [rdx + 0]    ;ip[0]
    1.29 +    movq        mm1, [rdx + 8]    ;ip[4]
    1.30 +    movq        mm7, rax          ; low two words of mm7 = 0003h, 0003h
    1.31 +
    1.32 +    movq        mm2, [rdx + 16]   ;ip[8]
    1.33 +    movq        mm3, [rdx + 24]   ;ip[12]
    1.34 +    punpcklwd   mm7, mm7          ;0003000300030003h -- bias of 3 in every word lane, added before the final >> 3
    1.35 +    mov         rdx, arg(1)       ; input is fully loaded, so rdx now becomes the output pointer
    1.36 +    ; ---- pass 1: lane-wise butterfly across the four loaded quadwords (16-bit wrapping adds/subs) ----
    1.37 +    movq        mm4, mm0          ; copy ip[0] so mm0 stays intact for the difference below
    1.38 +    movq        mm5, mm1          ; copy ip[4] so mm1 stays intact for the difference below
    1.39 +
    1.40 +    paddw       mm4, mm3          ;ip[0] + ip[12] aka a1
    1.41 +    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1
    1.42 +
    1.43 +    movq        mm6, mm4          ;temp a1
    1.44 +    paddw       mm4, mm5          ;a1 + b1
    1.45 +    psubw       mm6, mm5          ;a1 - b1
    1.46 +
    1.47 +    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
    1.48 +    psubw       mm1, mm2          ;ip[4] - ip[8] aka c1
    1.49 +
    1.50 +    movq        mm5, mm0          ;temp d1
    1.51 +    paddw       mm0, mm1          ;d1 + c1
    1.52 +    psubw       mm5, mm1          ;d1 - c1
    1.53 +    ; pass-1 results as a 4x4 word matrix, one row per register in the order mm4, mm0, mm6, mm5 (digits = row, column):
    1.54 +    ; 03 02 01 00
    1.55 +    ; 13 12 11 10
    1.56 +    ; 23 22 21 20
    1.57 +    ; 33 32 31 30
    1.58 +    ; transpose the matrix with word/dword unpacks so pass 2 can reuse the same lane-wise butterfly
    1.59 +    movq        mm3, mm4          ; 03 02 01 00
    1.60 +    punpcklwd   mm4, mm0          ; 11 01 10 00
    1.61 +    punpckhwd   mm3, mm0          ; 13 03 12 02
    1.62 +
    1.63 +    movq        mm1, mm6          ; 23 22 21 20
    1.64 +    punpcklwd   mm6, mm5          ; 31 21 30 20
    1.65 +    punpckhwd   mm1, mm5          ; 33 23 32 22
    1.66 +
    1.67 +    movq        mm0, mm4          ; 11 01 10 00
    1.68 +    movq        mm2, mm3          ; 13 03 12 02
    1.69 +
    1.70 +    punpckldq   mm0, mm6          ; 30 20 10 00 aka ip[0]
    1.71 +    punpckhdq   mm4, mm6          ; 31 21 11 01 aka ip[4]
    1.72 +
    1.73 +    punpckldq   mm2, mm1          ; 32 22 12 02 aka ip[8]
    1.74 +    punpckhdq   mm3, mm1          ; 33 23 13 03 aka ip[12]
    1.75 +;~~~~~~~~~~~~~~~~~~~~~ pass 2: same butterfly on the transposed data (mm0, mm4, mm2, mm3), then (x + 3) >> 3
    1.76 +    movq        mm1, mm0          ; copy ip[0] so mm0 stays intact for the difference below
    1.77 +    movq        mm5, mm4          ; copy ip[4] so mm4 stays intact for the difference below
    1.78 +    paddw       mm1, mm3          ;ip[0] + ip[12] aka a1
    1.79 +    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1
    1.80 +
    1.81 +    movq        mm6, mm1          ;temp a1
    1.82 +    paddw       mm1, mm5          ;a1 + b1
    1.83 +    psubw       mm6, mm5          ;a1 - b1
    1.84 +    paddw       mm1, mm7          ; add the bias of 3 to every word
    1.85 +    paddw       mm6, mm7          ; add the bias of 3 to every word
    1.86 +    psraw       mm1, 3            ; mm1 = (a1 + b1 + 3) >> 3, arithmetic shift
    1.87 +    psraw       mm6, 3            ; mm6 = (a1 - b1 + 3) >> 3, arithmetic shift
    1.88 +
    1.89 +    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
    1.90 +    psubw       mm4, mm2          ;ip[4] - ip[8] aka c1
    1.91 +
    1.92 +    movq        mm5, mm0          ;temp d1
    1.93 +    paddw       mm0, mm4          ;d1 + c1
    1.94 +    psubw       mm5, mm4          ;d1 - c1
    1.95 +    paddw       mm0, mm7          ; add the bias of 3 to every word
    1.96 +    paddw       mm5, mm7          ; add the bias of 3 to every word
    1.97 +    psraw       mm0, 3            ; mm0 = (d1 + c1 + 3) >> 3, arithmetic shift
    1.98 +    psraw       mm5, 3            ; mm5 = (d1 - c1 + 3) >> 3, arithmetic shift
    1.99 +;~~~~~~~~~~~~~~~~~~~~~
   1.100 +    ; ---- scatter: word j of mm1/mm0/mm6/mm5 is result 4*j+0 / +1 / +2 / +3; result k is stored at byte offset 32*k from output ----
   1.101 +    movd        eax, mm1          ; eax = words 1:0 of mm1 (results 4 and 0)
   1.102 +    movd        ecx, mm0          ; ecx = words 1:0 of mm0 (results 5 and 1)
   1.103 +    psrlq       mm0, 32           ; bring words 3:2 of mm0 down for the second movd
   1.104 +    psrlq       mm1, 32           ; bring words 3:2 of mm1 down for the second movd
   1.105 +    mov         word ptr[rdx+32*0], ax
   1.106 +    mov         word ptr[rdx+32*1], cx
   1.107 +    shr         eax, 16           ; expose the upper word (result 4) in ax
   1.108 +    shr         ecx, 16           ; expose the upper word (result 5) in cx
   1.109 +    mov         word ptr[rdx+32*4], ax
   1.110 +    mov         word ptr[rdx+32*5], cx
   1.111 +    movd        eax, mm1          ; eax = original words 3:2 of mm1 (results 12 and 8)
   1.112 +    movd        ecx, mm0          ; ecx = original words 3:2 of mm0 (results 13 and 9)
   1.113 +    mov         word ptr[rdx+32*8], ax
   1.114 +    mov         word ptr[rdx+32*9], cx
   1.115 +    shr         eax, 16           ; expose the upper word (result 12) in ax
   1.116 +    shr         ecx, 16           ; expose the upper word (result 13) in cx
   1.117 +    mov         word ptr[rdx+32*12], ax
   1.118 +    mov         word ptr[rdx+32*13], cx
   1.119 +    ; same extraction for mm6/mm5; NOTE(review): the 32-byte stride suggests each word lands at the start of a separate 16-short block -- confirm against callers
   1.120 +    movd        eax, mm6          ; eax = words 1:0 of mm6 (results 6 and 2)
   1.121 +    movd        ecx, mm5          ; ecx = words 1:0 of mm5 (results 7 and 3)
   1.122 +    psrlq       mm5, 32           ; bring words 3:2 of mm5 down for the second movd
   1.123 +    psrlq       mm6, 32           ; bring words 3:2 of mm6 down for the second movd
   1.124 +    mov         word ptr[rdx+32*2], ax
   1.125 +    mov         word ptr[rdx+32*3], cx
   1.126 +    shr         eax, 16           ; expose the upper word (result 6) in ax
   1.127 +    shr         ecx, 16           ; expose the upper word (result 7) in cx
   1.128 +    mov         word ptr[rdx+32*6], ax
   1.129 +    mov         word ptr[rdx+32*7], cx
   1.130 +    movd        eax, mm6          ; eax = original words 3:2 of mm6 (results 14 and 10)
   1.131 +    movd        ecx, mm5          ; ecx = original words 3:2 of mm5 (results 15 and 11)
   1.132 +    mov         word ptr[rdx+32*10], ax
   1.133 +    mov         word ptr[rdx+32*11], cx
   1.134 +    shr         eax, 16           ; expose the upper word (result 14) in ax
   1.135 +    shr         ecx, 16           ; expose the upper word (result 15) in cx
   1.136 +    mov         word ptr[rdx+32*14], ax
   1.137 +    mov         word ptr[rdx+32*15], cx
   1.138 +
   1.139 +    ; begin epilog -- NOTE(review): no emms is issued in this routine; presumably callers clear MMX state before any x87 use -- confirm
   1.140 +    UNSHADOW_ARGS                 ; macro from x86_abi_support.asm; presumably undoes SHADOW_ARGS_TO_STACK -- confirm
   1.141 +    pop         rbp               ; restore caller's rbp
   1.142 +    ret
   1.143 +

mercurial