media/libvpx/vp8/encoder/x86/dct_mmx.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/encoder/x86/dct_mmx.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,241 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +; 4x4 transform of 16-bit values with MMX: transpose, butterfly stage, transpose, butterfly stage with rounding, then store 16 shorts to output.
    1.17 +;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
    1.18 +global sym(vp8_short_fdct4x4_mmx) PRIVATE
    1.19 +sym(vp8_short_fdct4x4_mmx):
    1.20 +    push        rbp
    1.21 +    mov         rbp,        rsp
    1.22 +    SHADOW_ARGS_TO_STACK 3
    1.23 +    GET_GOT     rbx
    1.24 +    push        rsi
    1.25 +    push        rdi
    1.26 +    ; end prolog
    1.27 +
    1.28 +        mov         rsi,        arg(0)      ; input
    1.29 +        mov         rdi,        arg(1)      ; output
    1.30 +
    1.31 +        movsxd      rax,        dword ptr arg(2) ; pitch, used below as the byte offset between input rows
    1.32 +
    1.33 +        lea         rcx,        [rsi + rax*2]   ; rcx = address of input row 2
    1.34 +        ; read the four input rows (4 shorts each)
    1.35 +        movq        mm0,        [rsi]           ; row 0
    1.36 +        movq        mm1,        [rsi + rax]     ; row 1
    1.37 +
    1.38 +        movq        mm2,        [rcx]           ; row 2
    1.39 +        movq        mm4,        [rcx + rax]     ; row 3
    1.40 +
    1.41 +        ; transpose for the first stage (digit pairs below are row,column; low word listed first)
    1.42 +        movq        mm3,        mm0         ; 00 01 02 03
    1.43 +        movq        mm5,        mm2         ; 20 21 22 23
    1.44 +
    1.45 +        punpcklwd   mm0,        mm1         ; 00 10 01 11
    1.46 +        punpckhwd   mm3,        mm1         ; 02 12 03 13
    1.47 +
    1.48 +        punpcklwd   mm2,        mm4         ; 20 30 21 31
    1.49 +        punpckhwd   mm5,        mm4         ; 22 32 23 33
    1.50 +
    1.51 +        movq        mm1,        mm0         ; 00 10 01 11
    1.52 +        punpckldq   mm0,        mm2         ; 00 10 20 30
    1.53 +
    1.54 +        punpckhdq   mm1,        mm2         ; 01 11 21 31
    1.55 +
    1.56 +        movq        mm2,        mm3         ; 02 12 03 13
    1.57 +        punpckldq   mm2,        mm5         ; 02 12 22 32
    1.58 +
    1.59 +        punpckhdq   mm3,        mm5         ; 03 13 23 33
    1.60 +
    1.61 +        ; mm0 = column 0 (one word per input row)
    1.62 +        ; mm1 = column 1
    1.63 +        ; mm2 = column 2
    1.64 +        ; mm3 = column 3
    1.65 +
    1.66 +        ; first stage: each word lane processes one input row
    1.67 +        movq        mm5,        mm0         ; copy of 0, becomes d1
    1.68 +        movq        mm4,        mm1         ; copy of 1, becomes c1
    1.69 +
    1.70 +        paddw       mm0,        mm3         ; a1 = 0 + 3
    1.71 +        paddw       mm1,        mm2         ; b1 = 1 + 2
    1.72 +
    1.73 +        psubw       mm4,        mm2         ; c1 = 1 - 2
    1.74 +        psubw       mm5,        mm3         ; d1 = 0 - 3
    1.75 +
    1.76 +        psllw       mm5,        3           ; d1 <<= 3
    1.77 +        psllw       mm4,        3           ; c1 <<= 3
    1.78 +
    1.79 +        psllw       mm0,        3           ; a1 <<= 3
    1.80 +        psllw       mm1,        3           ; b1 <<= 3
    1.81 +
    1.82 +        ; output 0 and 2
    1.83 +        movq        mm2,        mm0         ; a1
    1.84 +
    1.85 +        paddw       mm0,        mm1         ; op[0] = a1 + b1
    1.86 +        psubw       mm2,        mm1         ; op[2] = a1 - b1
    1.87 +
    1.88 +        ; output 1 and 3
    1.89 +        ; interleave c1, d1 so pmaddwd can form both products of a pair at once
    1.90 +        movq        mm1,        mm5         ; d1
    1.91 +        punpcklwd   mm1,        mm4         ; d1,c1 pairs from the low half (d1 in the lower word of each pair)
    1.92 +        punpckhwd   mm5,        mm4         ; d1,c1 pairs from the high half (d1 in the lower word of each pair)
    1.93 +
    1.94 +        movq        mm3,        mm1         ; second copy of the low pairs for op[3]
    1.95 +        movq        mm4,        mm5         ; second copy of the high pairs for op[1]
    1.96 +
    1.97 +        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    1.98 +        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
    1.99 +
   1.100 +        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.101 +        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.102 +
   1.103 +        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]         ; + 14500 before the shift
   1.104 +        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]         ; + 14500 before the shift
   1.105 +        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]          ; + 7500 before the shift
   1.106 +        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]          ; + 7500 before the shift
   1.107 +
   1.108 +        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   1.109 +        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
   1.110 +        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   1.111 +        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
   1.112 +
   1.113 +        packssdw    mm1,        mm4         ; op[1]
   1.114 +        packssdw    mm3,        mm5         ; op[3]
   1.115 +
   1.116 +        ; first stage done (each word lane handled one input row)
   1.117 +        ; transpose for the second stage
   1.118 +        movq        mm4,        mm0         ; 00 10 20 30
   1.119 +        movq        mm5,        mm2         ; 02 12 22 32
   1.120 +
   1.121 +        punpcklwd   mm0,        mm1         ; 00 01 10 11
   1.122 +        punpckhwd   mm4,        mm1         ; 20 21 30 31
   1.123 +
   1.124 +        punpcklwd   mm2,        mm3         ; 02 03 12 13
   1.125 +        punpckhwd   mm5,        mm3         ; 22 23 32 33
   1.126 +
   1.127 +        movq        mm1,        mm0         ; 00 01 10 11
   1.128 +        punpckldq   mm0,        mm2         ; 00 01 02 03
   1.129 +
   1.130 +        punpckhdq   mm1,        mm2         ; 10 11 12 13
   1.131 +
   1.132 +        movq        mm2,        mm4         ; 20 21 30 31
   1.133 +        punpckldq   mm2,        mm5         ; 20 21 22 23
   1.134 +
   1.135 +        punpckhdq   mm4,        mm5         ; 30 31 32 33
   1.136 +
   1.137 +        ; mm0 0
   1.138 +        ; mm1 1
   1.139 +        ; mm2 2
   1.140 +        ; mm4 3
   1.141 +
   1.142 +        movq        mm5,        mm0         ; copy of 0, becomes d1
   1.143 +        movq        mm3,        mm1         ; copy of 1, becomes c1
   1.144 +
   1.145 +        paddw       mm0,        mm4         ; a1 = 0 + 3
   1.146 +        paddw       mm1,        mm2         ; b1 = 1 + 2
   1.147 +
   1.148 +        psubw       mm3,        mm2         ; c1 = 1 - 2
   1.149 +        psubw       mm5,        mm4         ; d1 = 0 - 3
   1.150 +
   1.151 +        pxor        mm6,        mm6         ; zero out for compare
   1.152 +
   1.153 +        pcmpeqw     mm6,        mm5         ; all ones in each word where d1 == 0
   1.154 +
   1.155 +        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; invert and mask: each word = (d1 != 0) ? 1 : 0
   1.156 +                                                                ; (upper bits cleared, only bit 0 kept)
   1.157 +
   1.158 +        ; output 0 and 2
   1.159 +        movq        mm2,        mm0         ; a1
   1.160 +
   1.161 +        paddw       mm0,        mm1         ; a1 + b1
   1.162 +        psubw       mm2,        mm1         ; a1 - b1
   1.163 +
   1.164 +        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]         ; + 7 before the shift
   1.165 +        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]         ; + 7 before the shift
   1.166 +
   1.167 +        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
   1.168 +        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
   1.169 +
   1.170 +        movq        MMWORD PTR[rdi + 0 ],  mm0      ; output shorts 0..3
   1.171 +        movq        MMWORD PTR[rdi + 16],  mm2      ; output shorts 8..11
   1.172 +
   1.173 +        ; output 1 and 3
   1.174 +        ; interleave c1, d1 so pmaddwd can form both products of a pair at once
   1.175 +        movq        mm1,        mm5         ; d1
   1.176 +        punpcklwd   mm1,        mm3         ; d1,c1 pairs from the low half (d1 in the lower word of each pair)
   1.177 +        punpckhwd   mm5,        mm3         ; d1,c1 pairs from the high half (d1 in the lower word of each pair)
   1.178 +
   1.179 +        movq        mm3,        mm1         ; second copy of the low pairs for op[12]
   1.180 +        movq        mm4,        mm5         ; second copy of the high pairs for op[4]
   1.181 +
   1.182 +        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   1.183 +        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
   1.184 +
   1.185 +        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.186 +        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
   1.187 +
   1.188 +        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]         ; + 12000 before the shift
   1.189 +        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]         ; + 12000 before the shift
   1.190 +        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]         ; + 51000 before the shift
   1.191 +        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]         ; + 51000 before the shift
   1.192 +
   1.193 +        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   1.194 +        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
   1.195 +        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   1.196 +        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
   1.197 +
   1.198 +        packssdw    mm1,        mm4         ; op[4]
   1.199 +        packssdw    mm3,        mm5         ; op[12]
   1.200 +
   1.201 +        paddw       mm1,        mm6         ; op[4] += (d1!=0)
   1.202 +
   1.203 +        movq        MMWORD PTR[rdi + 8 ],  mm1      ; output shorts 4..7
   1.204 +        movq        MMWORD PTR[rdi + 24],  mm3      ; output shorts 12..15
   1.205 +
   1.206 +     ; begin epilog
   1.207 +    pop         rdi
   1.208 +    pop         rsi
   1.209 +    RESTORE_GOT
   1.210 +    UNSHADOW_ARGS
   1.211 +    pop         rbp
   1.212 +    ret
   1.213 +
   1.214 +SECTION_RODATA                 ; constant tables read by vp8_short_fdct4x4_mmx, each 8 bytes (one MMX operand)
   1.215 +align 8
   1.216 +_5352_2217:                    ; pmaddwd multipliers: (5352, 2217) word pair, repeated twice
   1.217 +    dw 5352
   1.218 +    dw 2217
   1.219 +    dw 5352
   1.220 +    dw 2217
   1.221 +align 8
   1.222 +_2217_neg5352:                 ; pmaddwd multipliers: (2217, -5352) word pair, repeated twice
   1.223 +    dw 2217
   1.224 +    dw -5352
   1.225 +    dw 2217
   1.226 +    dw -5352
   1.227 +align 8
   1.228 +_cmp_mask:                     ; 1 in every word; pandn operand that reduces the d1 compare result to 0 or 1
   1.229 +    times 4 dw 1
   1.230 +align 8
   1.231 +_7w:                           ; 7 in every word; added before the >>4 in the second stage
   1.232 +    times 4 dw 7
   1.233 +align 8
   1.234 +_14500:                        ; dword pair; added before the >>12 of the first-stage op[1]
   1.235 +    times 2 dd 14500
   1.236 +align 8
   1.237 +_7500:                         ; dword pair; added before the >>12 of the first-stage op[3]
   1.238 +    times 2 dd 7500
   1.239 +align 8
   1.240 +_12000:                        ; dword pair; added before the >>16 of the second-stage op[4]
   1.241 +    times 2 dd 12000
   1.242 +align 8
   1.243 +_51000:                        ; dword pair; added before the >>16 of the second-stage op[12]
   1.244 +    times 2 dd 51000

mercurial