media/libvpx/vp8/common/x86/postproc_mmx.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/common/x86/postproc_mmx.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,313 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +%define VP8_FILTER_WEIGHT 128
    1.18 +%define VP8_FILTER_SHIFT  7
    1.19 +; Column post-filter, 4 pixels wide per pass: out = (pix + 15-row sum + vp8_rv[] term) >> 4 where 15*sumsq - sum*sum < flimit, else pix.
    1.20 +;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
    1.21 +;                             int pitch, int rows, int cols,int flimit)
    1.22 +extern sym(vp8_rv)
    1.23 +global sym(vp8_mbpost_proc_down_mmx) PRIVATE
    1.24 +sym(vp8_mbpost_proc_down_mmx):
    1.25 +    push        rbp
    1.26 +    mov         rbp, rsp
    1.27 +    SHADOW_ARGS_TO_STACK 5
    1.28 +    GET_GOT     rbx
    1.29 +    push        rsi
    1.30 +    push        rdi
    1.31 +    ; end prolog
    1.32 +
    1.33 +    ALIGN_STACK 16, rax
    1.34 +    sub         rsp, 136                ; 128 bytes reserved for d[] + 8 bytes for flimit2
    1.35 +
    1.36 +    ; unsigned char d[16][8] at [rsp]; the code below only touches [rsp+i*4], i = 0..15
    1.37 +    ; create flimit2 at [rsp+128]: flimit copied into both dwords so psubd can use it directly
    1.38 +    mov         eax, dword ptr arg(4) ;flimit
    1.39 +    mov         [rsp+128], eax
    1.40 +    mov         [rsp+128+4], eax
    1.41 +%define flimit2 [rsp+128]
    1.42 +
    1.43 +%if ABI_IS_32BIT=0
    1.44 +    lea         r8,       [GLOBAL(sym(vp8_rv))]   ; 64-bit builds keep &vp8_rv in r8 for the whole routine
    1.45 +%endif
    1.46 +
    1.47 +    ;rows +=8;  (the row loop runs 8 extra passes because its stores trail its loads by 8 rows)
    1.48 +    add         dword ptr arg(2), 8
    1.49 +
    1.50 +    ;for(c=0; c<cols; c+=4)
    1.51 +.loop_col:
    1.52 +            mov         rsi,        arg(0)  ;s
    1.53 +            pxor        mm0,        mm0     ; mm0 = 0, used for every unpack below
    1.54 +
    1.55 +            movsxd      rax,        dword ptr arg(1) ;pitch       ;
    1.56 +
    1.57 +            ; this copies the last row down into the border 8 rows
    1.58 +            mov         rdi,        rsi
    1.59 +            movsxd      rdx,        dword ptr arg(2)            ; rows (already +8); only the low dword of an int argument is defined
    1.60 +            sub         rdx,        9                           ; (rows+8) - 9 = index of the last real row
    1.61 +            imul        rdx,        rax
    1.62 +            lea         rdi,        [rdi+rdx]
    1.63 +            movq        mm1,        QWORD ptr[rdi]              ; last row
    1.64 +            mov         rcx,        8
    1.65 +.init_borderd:                                                   ; initialize borders
    1.66 +            lea         rdi,        [rdi + rax]
    1.67 +            movq        [rdi],      mm1
    1.68 +
    1.69 +            dec         rcx
    1.70 +            jne         .init_borderd
    1.71 +
    1.72 +            neg         rax                                     ; rax = -pitch
    1.73 +
    1.74 +            ; this copies the first row up into the border 8 rows
    1.75 +            mov         rdi,        rsi
    1.76 +            movq        mm1,        QWORD ptr[rdi]              ; first row
    1.77 +            mov         rcx,        8
    1.78 +.init_border:                                                   ; initialize borders
    1.79 +            lea         rdi,        [rdi + rax]
    1.80 +            movq        [rdi],      mm1
    1.81 +
    1.82 +            dec         rcx
    1.83 +            jne         .init_border
    1.84 +
    1.85 +
    1.86 +            lea         rsi,        [rsi + rax*8];              ; rsi = s - pitch*8
    1.87 +            neg         rax                                     ; rax = +pitch again
    1.88 +
    1.89 +
    1.90 +            pxor        mm5,        mm5     ; mm5 = running sum, 4 words
    1.91 +            pxor        mm6,        mm6     ; mm6 = running sum of squares, pixels 0-1 (dwords)
    1.92 +
    1.93 +            pxor        mm7,        mm7     ; mm7 = running sum of squares, pixels 2-3 (dwords)
    1.94 +            mov         rdi,        rsi
    1.95 +
    1.96 +            mov         rcx,        15          ; prime the window with 15 rows starting at s - pitch*8
    1.97 +
    1.98 +.loop_initvar:
    1.99 +            movd        mm1,        DWORD PTR [rdi];
   1.100 +            punpcklbw   mm1,        mm0     ; 4 bytes -> 4 words
   1.101 +
   1.102 +            paddw       mm5,        mm1     ; sum += pix
   1.103 +            pmullw      mm1,        mm1     ; pix*pix
   1.104 +
   1.105 +            movq        mm2,        mm1     ;
   1.106 +            punpcklwd   mm1,        mm0     ;
   1.107 +
   1.108 +            punpckhwd   mm2,        mm0     ;
   1.109 +            paddd       mm6,        mm1     ;
   1.110 +
   1.111 +            paddd       mm7,        mm2     ;
   1.112 +            lea         rdi,        [rdi+rax]   ;
   1.113 +
   1.114 +            dec         rcx
   1.115 +            jne         .loop_initvar
   1.116 +            ; rdx = row-loop counter r, starting at 0; rdi is now s + pitch*7
   1.117 +            xor         rdx,        rdx
   1.118 +.loop_row:
   1.119 +            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  row leaving the window
   1.120 +            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  row entering the window
   1.121 +
   1.122 +            punpcklbw   mm1,        mm0
   1.123 +            punpcklbw   mm2,        mm0
   1.124 +
   1.125 +            paddw       mm5,        mm2                 ; sum += entering - leaving
   1.126 +            psubw       mm5,        mm1
   1.127 +
   1.128 +            pmullw      mm2,        mm2
   1.129 +            movq        mm4,        mm2
   1.130 +
   1.131 +            punpcklwd   mm2,        mm0
   1.132 +            punpckhwd   mm4,        mm0
   1.133 +
   1.134 +            paddd       mm6,        mm2                 ; sumsq += entering^2
   1.135 +            paddd       mm7,        mm4
   1.136 +
   1.137 +            pmullw      mm1,        mm1
   1.138 +            movq        mm2,        mm1
   1.139 +
   1.140 +            punpcklwd   mm1,        mm0
   1.141 +            psubd       mm6,        mm1                 ; sumsq -= leaving^2
   1.142 +
   1.143 +            punpckhwd   mm2,        mm0
   1.144 +            psubd       mm7,        mm2
   1.145 +
   1.146 +
   1.147 +            movq        mm3,        mm6
   1.148 +            pslld       mm3,        4
   1.149 +
   1.150 +            psubd       mm3,        mm6                 ; mm3 = 16*sumsq - sumsq = 15*sumsq (pixels 0-1)
   1.151 +            movq        mm1,        mm5
   1.152 +
   1.153 +            movq        mm4,        mm5
   1.154 +            pmullw      mm1,        mm1                 ; low 16 bits of sum*sum
   1.155 +
   1.156 +            pmulhw      mm4,        mm4                 ; high 16 bits of sum*sum
   1.157 +            movq        mm2,        mm1
   1.158 +
   1.159 +            punpcklwd   mm1,        mm4                 ; mm1/mm2 = 32-bit sum*sum
   1.160 +            punpckhwd   mm2,        mm4
   1.161 +
   1.162 +            movq        mm4,        mm7
   1.163 +            pslld       mm4,        4
   1.164 +
   1.165 +            psubd       mm4,        mm7                 ; mm4 = 15*sumsq (pixels 2-3)
   1.166 +
   1.167 +            psubd       mm3,        mm1
   1.168 +            psubd       mm4,        mm2
   1.169 +
   1.170 +            psubd       mm3,        flimit2
   1.171 +            psubd       mm4,        flimit2
   1.172 +
   1.173 +            psrad       mm3,        31                  ; all ones where 15*sumsq - sum*sum - flimit < 0
   1.174 +            psrad       mm4,        31
   1.175 +
   1.176 +            packssdw    mm3,        mm4
   1.177 +            packsswb    mm3,        mm0                 ; mm3 = per-pixel byte mask in the low 4 bytes
   1.178 +
   1.179 +            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row (rsi trails it by 8 rows)
   1.180 +
   1.181 +            movq        mm2,        mm1                 ; mm2 = unfiltered pixels
   1.182 +            punpcklbw   mm1,        mm0
   1.183 +
   1.184 +            paddw       mm1,        mm5                 ; pix + sum
   1.185 +            mov         rcx,        rdx
   1.186 +
   1.187 +            and         rcx,        127
   1.188 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
   1.189 +            push        rax
   1.190 +            lea         rax,        [GLOBAL(sym(vp8_rv))]
   1.191 +            movq        mm4,        [rax + rcx*2] ;vp8_rv[rcx*2]
   1.192 +            pop         rax
   1.193 +%elif ABI_IS_32BIT=0
   1.194 +            movq        mm4,        [r8 + rcx*2] ;vp8_rv[rcx*2]
   1.195 +%else
   1.196 +            movq        mm4,        [sym(vp8_rv) + rcx*2]
   1.197 +%endif
   1.198 +            paddw       mm1,        mm4                 ; + four words read at vp8_rv + (r & 127)*2
   1.199 +            psraw       mm1,        4
   1.200 +
   1.201 +            packuswb    mm1,        mm0
   1.202 +            pand        mm1,        mm3                 ; filtered value where the mask is set
   1.203 +
   1.204 +            pandn       mm3,        mm2                 ; original value elsewhere
   1.205 +            por         mm1,        mm3
   1.206 +
   1.207 +            and         rcx,        15
   1.208 +            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[rcx*4]  queue result in slot r & 15
   1.209 +
   1.210 +            mov         rcx,        rdx
   1.211 +            sub         rcx,        8
   1.212 +
   1.213 +            and         rcx,        15
   1.214 +            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[rcx*4]  result queued 8 passes ago
   1.215 +
   1.216 +            movd        [rsi],      mm1                 ; stored 8 rows behind the row being filtered
   1.217 +            lea         rsi,        [rsi+rax]
   1.218 +
   1.219 +            lea         rdi,        [rdi+rax]
   1.220 +            add         rdx,        1
   1.221 +
   1.222 +            cmp         edx,        dword arg(2) ;rows
   1.223 +            jl          .loop_row
   1.224 +        mov         rax, arg(0)     ; s += 4 at full pointer width: a dword add would drop
   1.225 +        add         rax, 4          ; the carry into the upper half of a 64-bit pointer
   1.226 +        mov         arg(0), rax     ; (rax is reloaded with pitch at .loop_col)
   1.227 +        sub         dword arg(3), 4 ; cols -= 4
   1.228 +        cmp         dword arg(3), 0
   1.229 +        jg          .loop_col
   1.230 +
   1.231 +    add         rsp, 136
   1.232 +    pop         rsp
   1.233 +
   1.234 +    ; begin epilog
   1.235 +    pop rdi
   1.236 +    pop rsi
   1.237 +    RESTORE_GOT
   1.238 +    UNSHADOW_ARGS
   1.239 +    pop         rbp
   1.240 +    ret
   1.241 +%undef flimit2
   1.242 +
   1.243 +
   1.244 +; Adds noise to a plane row by row; every row reads the noise buffer at a fresh (rand() & 0xff) byte offset.
   1.245 +;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
   1.246 +;                            unsigned char blackclamp[16], unsigned char whiteclamp[16],
   1.247 +;                            unsigned char bothclamp[16],
   1.248 +;                            unsigned int Width, unsigned int Height, int Pitch)
   1.249 +extern sym(rand)
   1.250 +global sym(vp8_plane_add_noise_mmx) PRIVATE
   1.251 +sym(vp8_plane_add_noise_mmx):
   1.252 +    push        rbp
   1.253 +    mov         rbp, rsp
   1.254 +    SHADOW_ARGS_TO_STACK 8
   1.255 +    GET_GOT     rbx
   1.256 +    push        rsi
   1.257 +    push        rdi
   1.258 +    ; end prolog
   1.259 +
   1.260 +    ; one pass of .noise_row per image row; all working registers are loaded after the call
   1.261 +.noise_row:
   1.262 +    call    sym(rand) WRT_PLT
   1.263 +    mov     rdi, arg(1)                 ; noise
   1.264 +    and     rax, 0xff                   ; keep the low byte of the rand() result
   1.265 +    add     rdi, rax                    ; rdi = noise + random offset in 0..255
   1.266 +
   1.267 +    ; The three clamp vectors are expected to sit back to back in memory as
   1.268 +    ; black, white, both, so a single base pointer reaches all of them.
   1.269 +    ; The base is fetched again on every row since rand() may overwrite rdx.
   1.270 +    mov     rdx, arg(2)                 ; blackclamp
   1.271 +    mov     rsi, arg(0)                 ; start of the current row
   1.272 +    movsxd  rcx, dword arg(5)           ; Width
   1.273 +    xor     rax, rax                    ; rax = byte offset within the row
   1.274 +
   1.275 +    ; eight bytes are clamped, offset by noise and written back per pass
   1.276 +.noise_block:
   1.277 +    movq        mm1, [rsi+rax]          ; source pixels
   1.278 +
   1.279 +    psubusb     mm1, [rdx]              ; blackclamp  (saturating clamp so adding noise cannot wrap)
   1.280 +    paddusb     mm1, [rdx+32]           ; bothclamp
   1.281 +    psubusb     mm1, [rdx+16]           ; whiteclamp
   1.282 +
   1.283 +    movq        mm2, [rdi+rax]          ; noise bytes for these pixels
   1.284 +    paddb       mm1, mm2                ; wrapping byte add
   1.285 +    movq        [rsi+rax], mm1          ; write the result in place
   1.286 +
   1.287 +    add         rax, 8                  ; next group of eight bytes
   1.288 +
   1.289 +    cmp         rax, rcx
   1.290 +    jl          .noise_block
   1.291 +
   1.292 +    ; step to the following row and count it off
   1.293 +    movsxd  rax, dword arg(7)           ; Pitch
   1.294 +    add     arg(0), rax                 ; Start += Pitch
   1.295 +    sub     dword arg(6), 1             ; Height -= 1
   1.296 +    jg      .noise_row
   1.297 +
   1.298 +    ; begin epilog
   1.299 +    pop rdi
   1.300 +    pop rsi
   1.301 +    RESTORE_GOT
   1.302 +    UNSHADOW_ARGS
   1.303 +    pop         rbp
   1.304 +    ret
   1.305 +
   1.306 +
   1.307 +SECTION_RODATA
   1.308 +align 16
   1.309 +Blur:                       ; 48 words in four runs; not referenced by the routines in this listing
   1.310 +    times 16 dw 0x10        ; 16 words of 16
   1.311 +    times  8 dw 0x40        ;  8 words of 64
   1.312 +    times 16 dw 0x10        ; 16 words of 16
   1.313 +    times  8 dw 0x00        ;  8 words of 0
   1.314 +
   1.315 +rd:                         ; four words of 64; not referenced by the routines in this listing
   1.316 +    times 4 dw 64

mercurial