media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,534 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +%define VP9_FILTER_WEIGHT 128
    1.18 +%define VP9_FILTER_SHIFT  7
    1.19 +; Per row: a vertical ("down") 5-tap blur from src into dst, then a horizontal ("across") 5-tap blur of that dst row in place; a pixel keeps its unblurred value when any |centre - tap| exceeds flimit.
    1.20 +;void vp9_post_proc_down_and_across_mmx
    1.21 +;(
    1.22 +;    unsigned char *src_ptr,           ; source; rows -2..+2 around the current row are read
    1.23 +;    unsigned char *dst_ptr,           ; destination; also read 2 bytes left of each row by the across pass
    1.24 +;    int src_pixels_per_line,          ; source pitch
    1.25 +;    int dst_pixels_per_line,          ; destination pitch
    1.26 +;    int rows,                         ; row count; the row loop is dec/jnz, so it must be >= 1
    1.27 +;    int cols,                         ; column count; handled 4 pixels per iteration
    1.28 +;    int flimit                        ; threshold; only the low 16 bits are broadcast and compared
    1.29 +;)
    1.30 +global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
    1.31 +sym(vp9_post_proc_down_and_across_mmx):
    1.32 +    push        rbp
    1.33 +    mov         rbp, rsp
    1.34 +    SHADOW_ARGS_TO_STACK 7
    1.35 +    GET_GOT     rbx
    1.36 +    push        rsi
    1.37 +    push        rdi
    1.38 +    ; end prolog
    1.39 +
    1.40 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    1.41 +    ; move the global rd onto the stack, since we don't have enough registers
    1.42 +    ; to do PIC addressing (rbx is overwritten with the Blur table address below)
    1.43 +    movq        mm0, [GLOBAL(rd)]
    1.44 +    sub         rsp, 8                ; NOTE(review): no matching "add rsp, 8" is visible before the pops in the epilog - confirm
    1.45 +    movq        [rsp], mm0
    1.46 +%define RD [rsp]
    1.47 +%else
    1.48 +%define RD [GLOBAL(rd)]
    1.49 +%endif
    1.50 +        ; NOTE(review): in the stack-copy case RD is [rsp], yet "push rbx" below and "push rax" before .acrossnextcol move rsp before RD is read - confirm
    1.51 +        push        rbx                   ; rbx is reused as the Blur table base
    1.52 +        lea         rbx, [GLOBAL(Blur)]
    1.53 +        movd        mm2, dword ptr arg(6) ;flimit
    1.54 +        punpcklwd   mm2, mm2              ; broadcast the low word of flimit ...
    1.55 +        punpckldq   mm2, mm2              ; ... into all four words of mm2
    1.56 +
    1.57 +        mov         rsi,        arg(0) ;src_ptr
    1.58 +        mov         rdi,        arg(1) ;dst_ptr
    1.59 +
    1.60 +        movsxd      rcx, DWORD PTR arg(4) ;rows
    1.61 +        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
    1.62 +        pxor        mm0, mm0              ; mm0 = 00000000
    1.63 +
    1.64 +.nextrow:
    1.65 +        ; down pass: 4 pixels per iteration, src rows -2..+2 -> dst
    1.66 +        xor         rdx,        rdx       ; clear out rdx for use as loop counter
    1.67 +.nextcol:
    1.68 +
    1.69 +        pxor        mm7, mm7              ; mm7 = 00000000
    1.70 +        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
    1.71 +        movq        mm3, [rsi]            ; mm3 = r0 p0..p7
    1.72 +        punpcklbw   mm3, mm0              ; mm3 = p0..p3
    1.73 +        movq        mm1, mm3              ; mm1 = p0..p3
    1.74 +        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
    1.75 +
    1.76 +        movq        mm6, [rbx + 48]       ; mm6 = kernel 3 taps
    1.77 +        movq        mm5, [rsi + rax]      ; mm5 = r1 p0..p7
    1.78 +        punpcklbw   mm5, mm0              ; mm5 = r1 p0..p3
    1.79 +        pmullw      mm6, mm5              ; mm6 = r1 p0..p3 * kernel 3 modifiers
    1.80 +        paddusw     mm3, mm6              ; mm3 += mm6
    1.81 +
    1.82 +        ; thresholding
    1.83 +        movq        mm7, mm1              ; mm7 = r0 p0..p3
    1.84 +        psubusw     mm7, mm5              ; mm7 = r0 p0..p3 - r1 p0..p3
    1.85 +        psubusw     mm5, mm1              ; mm5 = r1 p0..p3 - r0 p0..p3
    1.86 +        paddusw     mm7, mm5              ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
    1.87 +        pcmpgtw     mm7, mm2              ; mm7 = all-ones words where abs diff > flimit
    1.88 +
    1.89 +        movq        mm6, [rbx + 64 ]      ; mm6 = kernel 4 modifiers
    1.90 +        movq        mm5, [rsi + 2*rax]    ; mm5 = r2 p0..p7
    1.91 +        punpcklbw   mm5, mm0              ; mm5 = r2 p0..p3
    1.92 +        pmullw      mm6, mm5              ; mm6 = r2 p0..p3 * kernel 4 modifiers
    1.93 +        paddusw     mm3, mm6              ; mm3 += mm6
    1.94 +
    1.95 +        ; thresholding
    1.96 +        movq        mm6, mm1              ; mm6 = r0 p0..p3
    1.97 +        psubusw     mm6, mm5              ; mm6 = r0 p0..p3 - r2 p0..p3
    1.98 +        psubusw     mm5, mm1              ; mm5 = r2 p0..p3 - r0 p0..p3
    1.99 +        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
   1.100 +        pcmpgtw     mm6, mm2
   1.101 +        por         mm7, mm6              ; accumulate thresholds
   1.102 +
   1.103 +
   1.104 +        neg         rax                   ; rax = -pitch, to address the rows above
   1.105 +        movq        mm6, [rbx ]           ; kernel 0 taps
   1.106 +        movq        mm5, [rsi+2*rax]      ; mm5 = r-2 p0..p7
   1.107 +        punpcklbw   mm5, mm0              ; mm5 = r-2 p0..p3
   1.108 +        pmullw      mm6, mm5              ; mm6 = r-2 p0..p3 * kernel 0 modifiers
   1.109 +        paddusw     mm3, mm6              ; mm3 += mm6
   1.110 +
   1.111 +        ; thresholding
   1.112 +        movq        mm6, mm1              ; mm6 = r0 p0..p3
   1.113 +        psubusw     mm6, mm5              ; mm6 = p0..p3 - r-2 p0..p3
   1.114 +        psubusw     mm5, mm1              ; mm5 = r-2 p0..p3 - p0..p3
   1.115 +        paddusw     mm6, mm5              ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
   1.116 +        pcmpgtw     mm6, mm2
   1.117 +        por         mm7, mm6              ; accumulate thresholds
   1.118 +
   1.119 +        movq        mm6, [rbx + 16]       ; kernel 1 taps
   1.120 +        movq        mm4, [rsi+rax]        ; mm4 = r-1 p0..p7
   1.121 +        punpcklbw   mm4, mm0              ; mm4 = r-1 p0..p3
   1.122 +        pmullw      mm6, mm4              ; mm6 = r-1 p0..p3 * kernel 1 modifiers.
   1.123 +        paddusw     mm3, mm6              ; mm3 += mm6
   1.124 +
   1.125 +        ; thresholding
   1.126 +        movq        mm6, mm1              ; mm6 = r0 p0..p3
   1.127 +        psubusw     mm6, mm4              ; mm6 = p0..p3 - r-1 p0..p3
   1.128 +        psubusw     mm4, mm1              ; mm4 = r-1 p0..p3 - p0..p3
   1.129 +        paddusw     mm6, mm4              ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
   1.130 +        pcmpgtw     mm6, mm2
   1.131 +        por         mm7, mm6              ; accumulate thresholds
   1.132 +
   1.133 +
   1.134 +        paddusw     mm3, RD               ; mm3 += round value
   1.135 +        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
   1.136 +
   1.137 +        pand        mm1, mm7              ; mm1 = source pixels where some abs diff > thresh
   1.138 +        pandn       mm7, mm3              ; mm7 = blurred result where no abs diff > thresh
   1.139 +        paddusw     mm1, mm7              ; combination
   1.140 +
   1.141 +        packuswb    mm1, mm0              ; pack to bytes
   1.142 +
   1.143 +        movd        [rdi], mm1            ; store the 4 output pixels
   1.144 +        neg         rax                   ; pitch is positive
   1.145 +
   1.146 +
   1.147 +        add         rsi, 4
   1.148 +        add         rdi, 4
   1.149 +        add         rdx, 4
   1.150 +
   1.151 +        cmp         edx, dword ptr arg(5) ;cols
   1.152 +        jl          .nextcol
   1.153 +        ; done with all the cols, start the across filtering in place
   1.154 +        sub         rsi, rdx              ; rewind src to the start of the row
   1.155 +        sub         rdi, rdx              ; rewind dst to the start of the row
   1.156 +
   1.157 +
   1.158 +        push        rax                   ; save the pitch; eax carries the pending output below
   1.159 +        xor         rdx,    rdx
   1.160 +        mov         rax,    [rdi-4]; eax = bytes just left of the row; the first store in the loop writes them back unchanged
   1.161 +
   1.162 +.acrossnextcol:
   1.163 +        pxor        mm7, mm7              ; mm7 = 00000000
   1.164 +        movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
   1.165 +        movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
   1.166 +        movq        mm3, mm4              ; mm3 = p0..p7
   1.167 +        punpcklbw   mm3, mm0              ; mm3 = p0..p3
   1.168 +        movq        mm1, mm3              ; mm1 = p0..p3
   1.169 +        pmullw      mm3, mm6              ; mm3 *= kernel 2 modifiers
   1.170 +
   1.171 +        movq        mm6, [rbx + 48]
   1.172 +        psrlq       mm4, 8                ; mm4 = p1..p7
   1.173 +        movq        mm5, mm4              ; mm5 = p1..p7
   1.174 +        punpcklbw   mm5, mm0              ; mm5 = p1..p4
   1.175 +        pmullw      mm6, mm5              ; mm6 = p1..p4 * kernel 3 modifiers
   1.176 +        paddusw     mm3, mm6              ; mm3 += mm6
   1.177 +
   1.178 +        ; thresholding
   1.179 +        movq        mm7, mm1              ; mm7 = p0..p3
   1.180 +        psubusw     mm7, mm5              ; mm7 = p0..p3 - p1..p4
   1.181 +        psubusw     mm5, mm1              ; mm5 = p1..p4 - p0..p3
   1.182 +        paddusw     mm7, mm5              ; mm7 = abs(p0..p3 - p1..p4)
   1.183 +        pcmpgtw     mm7, mm2
   1.184 +
   1.185 +        movq        mm6, [rbx + 64 ]
   1.186 +        psrlq       mm4, 8                ; mm4 = p2..p7
   1.187 +        movq        mm5, mm4              ; mm5 = p2..p7
   1.188 +        punpcklbw   mm5, mm0              ; mm5 = p2..p5
   1.189 +        pmullw      mm6, mm5              ; mm6 = p2..p5 * kernel 4 modifiers
   1.190 +        paddusw     mm3, mm6              ; mm3 += mm6
   1.191 +
   1.192 +        ; thresholding
   1.193 +        movq        mm6, mm1              ; mm6 = p0..p3
   1.194 +        psubusw     mm6, mm5              ; mm6 = p0..p3 - p2..p5
   1.195 +        psubusw     mm5, mm1              ; mm5 = p2..p5 - p0..p3
   1.196 +        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p2..p5)
   1.197 +        pcmpgtw     mm6, mm2
   1.198 +        por         mm7, mm6              ; accumulate thresholds
   1.199 +
   1.200 +
   1.201 +        movq        mm6, [rbx ]
   1.202 +        movq        mm4, [rdi+rdx-2]      ; mm4 = p-2..p5
   1.203 +        movq        mm5, mm4              ; mm5 = p-2..p5
   1.204 +        punpcklbw   mm5, mm0              ; mm5 = p-2..p1
   1.205 +        pmullw      mm6, mm5              ; mm6 = p-2..p1 * kernel 0 modifiers
   1.206 +        paddusw     mm3, mm6              ; mm3 += mm6
   1.207 +
   1.208 +        ; thresholding
   1.209 +        movq        mm6, mm1              ; mm6 = p0..p3
   1.210 +        psubusw     mm6, mm5              ; mm6 = p0..p3 - p-2..p1
   1.211 +        psubusw     mm5, mm1              ; mm5 = p-2..p1 - p0..p3
   1.212 +        paddusw     mm6, mm5              ; mm6 = abs(p0..p3 - p-2..p1)
   1.213 +        pcmpgtw     mm6, mm2
   1.214 +        por         mm7, mm6              ; accumulate thresholds
   1.215 +
   1.216 +        movq        mm6, [rbx + 16]
   1.217 +        psrlq       mm4, 8                ; mm4 = p-1..p5
   1.218 +        punpcklbw   mm4, mm0              ; mm4 = p-1..p2
   1.219 +        pmullw      mm6, mm4              ; mm6 = p-1..p2 * kernel 1 modifiers.
   1.220 +        paddusw     mm3, mm6              ; mm3 += mm6
   1.221 +
   1.222 +        ; thresholding
   1.223 +        movq        mm6, mm1              ; mm6 = p0..p3
   1.224 +        psubusw     mm6, mm4              ; mm6 = p0..p3 - p-1..p2
   1.225 +        psubusw     mm4, mm1              ; mm4 = p-1..p2 - p0..p3
   1.226 +        paddusw     mm6, mm4              ; mm6 = abs(p0..p3 - p-1..p2)
   1.227 +        pcmpgtw     mm6, mm2
   1.228 +        por         mm7, mm6              ; accumulate thresholds
   1.229 +
   1.230 +        paddusw     mm3, RD               ; mm3 += round value
   1.231 +        psraw       mm3, VP9_FILTER_SHIFT     ; mm3 /= 128
   1.232 +
   1.233 +        pand        mm1, mm7              ; mm1 = source pixels where some abs diff > thresh
   1.234 +        pandn       mm7, mm3              ; mm7 = blurred result where no abs diff > thresh
   1.235 +        paddusw     mm1, mm7              ; combination
   1.236 +
   1.237 +        packuswb    mm1, mm0              ; pack to bytes
   1.238 +        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes (deferred so the p-2/p-1 loads above read unfiltered pixels)
   1.239 +        movd        eax,    mm1           ; hold this iteration's result until the next store
   1.240 +
   1.241 +        add         rdx, 4
   1.242 +        cmp         edx, dword ptr arg(5) ;cols
   1.243 +        jl          .acrossnextcol;
   1.244 +
   1.245 +        mov         DWORD PTR [rdi+rdx-4],  eax   ; flush the last four pixels of the row
   1.246 +        pop         rax                   ; rax = source pitch saved before the across loop
   1.247 +
   1.248 +        ; done with this row
   1.249 +        add         rsi,rax               ; next line
   1.250 +        movsxd      rax, dword ptr arg(3) ;dst_pixels_per_line (destination pitch)
   1.251 +        add         rdi,rax               ; next destination
   1.252 +        movsxd      rax, dword ptr arg(2) ;src_pixels_per_line (source pitch, reloaded for the next row)
   1.253 +
   1.254 +        dec         rcx                   ; decrement count
   1.255 +        jnz         .nextrow               ; next row
   1.256 +        pop         rbx
   1.257 +
   1.258 +    ; begin epilog (NOTE(review): no emms is executed before ret - presumably left to the caller, confirm)
   1.259 +    pop rdi
   1.260 +    pop rsi
   1.261 +    RESTORE_GOT
   1.262 +    UNSHADOW_ARGS
   1.263 +    pop         rbp
   1.264 +    ret
   1.265 +%undef RD
   1.266 +
   1.267 +; In place, per 4-pixel-wide column strip: slide a 15-row window down the image keeping a running sum and sum of squares; where 15*sumsq - sum*sum < flimit the pixel becomes (pixel + sum + vp9_rv term) >> 4, otherwise it is kept.
   1.268 +;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
   1.269 +;                             int pitch, int rows, int cols,int flimit)
   1.270 +extern sym(vp9_rv)
   1.271 +global sym(vp9_mbpost_proc_down_mmx) PRIVATE
   1.272 +sym(vp9_mbpost_proc_down_mmx):
   1.273 +    push        rbp
   1.274 +    mov         rbp, rsp
   1.275 +    SHADOW_ARGS_TO_STACK 5
   1.276 +    GET_GOT     rbx
   1.277 +    push        rsi
   1.278 +    push        rdi
   1.279 +    ; end prolog
   1.280 +
   1.281 +    ALIGN_STACK 16, rax
   1.282 +    sub         rsp, 136              ; 128 bytes reserved for d + 8 bytes for flimit2
   1.283 +
   1.284 +    ; d at [rsp]: ring buffer of 16 entries; the code below indexes it as [rsp+rcx*4], i.e. 4 bytes per entry
   1.285 +    ; create flimit2 at [rsp+128]: flimit replicated into two dwords
   1.286 +    mov         eax, dword ptr arg(4) ;flimit
   1.287 +    mov         [rsp+128], eax
   1.288 +    mov         [rsp+128+4], eax
   1.289 +%define flimit2 [rsp+128]
   1.290 +
   1.291 +%if ABI_IS_32BIT=0
   1.292 +    lea         r8,       [GLOBAL(sym(vp9_rv))]
   1.293 +%endif
   1.294 +
   1.295 +    ;rows +=8; (each output is written 8 rows behind the row being filtered, see .loop_row)
   1.296 +    add         dword ptr arg(2), 8
   1.297 +
   1.298 +    ;for(c=0; c<cols; c+=4)
   1.299 +.loop_col:
   1.300 +            mov         rsi,        arg(0)  ;s
   1.301 +            pxor        mm0,        mm0     ; mm0 = 0, used for unpacking
   1.302 +
   1.303 +            movsxd      rax,        dword ptr arg(1) ;pitch       ;
   1.304 +            neg         rax                                     ; rax = -pitch
   1.305 +
   1.306 +            lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
   1.307 +            neg         rax                                     ; rax = pitch again
   1.308 +
   1.309 +
   1.310 +            pxor        mm5,        mm5     ; mm5 = running sum (4 words)
   1.311 +            pxor        mm6,        mm6     ; mm6 = running sum of squares, columns 0..1 (dwords)
   1.312 +
   1.313 +            pxor        mm7,        mm7     ; mm7 = running sum of squares, columns 2..3 (dwords)
   1.314 +            mov         rdi,        rsi
   1.315 +
   1.316 +            mov         rcx,        15          ; prime the window with 15 rows: s[-8*pitch] .. s[6*pitch]
   1.317 +
   1.318 +.loop_initvar:
   1.319 +            movd        mm1,        DWORD PTR [rdi];
   1.320 +            punpcklbw   mm1,        mm0     ; 4 pixels -> 4 words
   1.321 +
   1.322 +            paddw       mm5,        mm1     ; sum += row
   1.323 +            pmullw      mm1,        mm1     ; mm1 = row^2 (at most 255^2, still fits an unsigned word)
   1.324 +
   1.325 +            movq        mm2,        mm1     ;
   1.326 +            punpcklwd   mm1,        mm0     ; squares of columns 0..1 as dwords
   1.327 +
   1.328 +            punpckhwd   mm2,        mm0     ; squares of columns 2..3 as dwords
   1.329 +            paddd       mm6,        mm1     ;
   1.330 +
   1.331 +            paddd       mm7,        mm2     ;
   1.332 +            lea         rdi,        [rdi+rax]   ; next row
   1.333 +
   1.334 +            dec         rcx
   1.335 +            jne         .loop_initvar
   1.336 +            ;save the var and sum (they stay in mm5/mm6/mm7); rdi = &s[7*pitch] here
   1.337 +            xor         rdx,        rdx     ; rdx = row counter r
   1.338 +.loop_row:
   1.339 +            movd        mm1,        DWORD PTR [rsi]     ; row leaving the window   [s-pitch*8]
   1.340 +            movd        mm2,        DWORD PTR [rdi]     ; row entering the window  [s+pitch*7]
   1.341 +
   1.342 +            punpcklbw   mm1,        mm0
   1.343 +            punpcklbw   mm2,        mm0
   1.344 +
   1.345 +            paddw       mm5,        mm2     ; sum += entering row
   1.346 +            psubw       mm5,        mm1     ; sum -= leaving row
   1.347 +
   1.348 +            pmullw      mm2,        mm2
   1.349 +            movq        mm4,        mm2
   1.350 +
   1.351 +            punpcklwd   mm2,        mm0
   1.352 +            punpckhwd   mm4,        mm0
   1.353 +
   1.354 +            paddd       mm6,        mm2     ; sumsq += entering^2
   1.355 +            paddd       mm7,        mm4
   1.356 +
   1.357 +            pmullw      mm1,        mm1
   1.358 +            movq        mm2,        mm1
   1.359 +
   1.360 +            punpcklwd   mm1,        mm0
   1.361 +            psubd       mm6,        mm1     ; sumsq -= leaving^2
   1.362 +
   1.363 +            punpckhwd   mm2,        mm0
   1.364 +            psubd       mm7,        mm2
   1.365 +
   1.366 +
   1.367 +            movq        mm3,        mm6
   1.368 +            pslld       mm3,        4
   1.369 +
   1.370 +            psubd       mm3,        mm6     ; mm3 = 15 * sumsq (columns 0..1)
   1.371 +            movq        mm1,        mm5
   1.372 +
   1.373 +            movq        mm4,        mm5
   1.374 +            pmullw      mm1,        mm1     ; low words of sum*sum
   1.375 +
   1.376 +            pmulhw      mm4,        mm4     ; high words of sum*sum
   1.377 +            movq        mm2,        mm1
   1.378 +
   1.379 +            punpcklwd   mm1,        mm4     ; mm1 = sum*sum as dwords, columns 0..1
   1.380 +            punpckhwd   mm2,        mm4     ; mm2 = sum*sum as dwords, columns 2..3
   1.381 +
   1.382 +            movq        mm4,        mm7
   1.383 +            pslld       mm4,        4
   1.384 +
   1.385 +            psubd       mm4,        mm7     ; mm4 = 15 * sumsq (columns 2..3)
   1.386 +
   1.387 +            psubd       mm3,        mm1     ; 15*sumsq - sum*sum
   1.388 +            psubd       mm4,        mm2
   1.389 +
   1.390 +            psubd       mm3,        flimit2
   1.391 +            psubd       mm4,        flimit2
   1.392 +
   1.393 +            psrad       mm3,        31      ; all-ones where 15*sumsq - sum*sum - flimit is negative
   1.394 +            psrad       mm4,        31
   1.395 +
   1.396 +            packssdw    mm3,        mm4     ; narrow the 4 dword masks ...
   1.397 +            packsswb    mm3,        mm0     ; ... to 4 byte masks in the low dword
   1.398 +
   1.399 +            movd        mm1,        DWORD PTR [rsi+rax*8]   ; centre row of the window (8 rows below rsi)
   1.400 +
   1.401 +            movq        mm2,        mm1     ; keep the unfiltered pixels
   1.402 +            punpcklbw   mm1,        mm0
   1.403 +
   1.404 +            paddw       mm1,        mm5     ; pixel + window sum
   1.405 +            mov         rcx,        rdx
   1.406 +
   1.407 +            and         rcx,        127     ; rcx = r & 127, index into vp9_rv
   1.408 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
   1.409 +            push        rax
   1.410 +            lea         rax,        [GLOBAL(sym(vp9_rv))]
   1.411 +            movq        mm4,        [rax + rcx*2] ;4 words of vp9_rv starting at byte offset rcx*2
   1.412 +            pop         rax
   1.413 +%elif ABI_IS_32BIT=0
   1.414 +            movq        mm4,        [r8 + rcx*2] ;4 words of vp9_rv starting at byte offset rcx*2
   1.415 +%else
   1.416 +            movq        mm4,        [sym(vp9_rv) + rcx*2]
   1.417 +%endif
   1.418 +            paddw       mm1,        mm4     ; + vp9_rv words
   1.419 +            ;paddw     xmm1,       eight8s
   1.420 +            psraw       mm1,        4       ; >> 4
   1.421 +
   1.422 +            packuswb    mm1,        mm0
   1.423 +            pand        mm1,        mm3     ; filtered value where the mask is set
   1.424 +
   1.425 +            pandn       mm3,        mm2     ; original pixel elsewhere
   1.426 +            por         mm1,        mm3
   1.427 +
   1.428 +            and         rcx,        15      ; rcx = r & 15
   1.429 +            movd        DWORD PTR   [rsp+rcx*4], mm1 ;d[r & 15] = result for row r
   1.430 +
   1.431 +            mov         rcx,        rdx
   1.432 +            sub         rcx,        8
   1.433 +
   1.434 +            and         rcx,        15      ; rcx = (r - 8) & 15
   1.435 +            movd        mm1,        DWORD PTR [rsp+rcx*4] ;d[(r - 8) & 15], the entry stored 8 iterations earlier
   1.436 +
   1.437 +            movd        [rsi],      mm1     ; write it at rsi (8 rows above the centre row). NOTE(review): for r < 8 this stores d entries not written during this column pass into the 8 rows above s - confirm callers allow it
   1.438 +            lea         rsi,        [rsi+rax]
   1.439 +
   1.440 +            lea         rdi,        [rdi+rax]
   1.441 +            add         rdx,        1
   1.442 +
   1.443 +            cmp         edx,        dword arg(2) ;rows
   1.444 +            jl          .loop_row
   1.445 +
   1.446 +
   1.447 +        add         dword arg(0), 4 ; s += 4  NOTE(review): dword-sized add on the pointer argument slot - confirm this is intended on 64-bit builds
   1.448 +        sub         dword arg(3), 4 ; cols -= 4
   1.449 +        cmp         dword arg(3), 0
   1.450 +        jg          .loop_col
   1.451 +
   1.452 +    add         rsp, 136
   1.453 +    pop         rsp                   ; presumably restores the rsp saved by ALIGN_STACK - macro body not visible here
   1.454 +
   1.455 +    ; begin epilog (NOTE(review): no emms is executed before ret - presumably left to the caller, confirm)
   1.456 +    pop rdi
   1.457 +    pop rsi
   1.458 +    RESTORE_GOT
   1.459 +    UNSHADOW_ARGS
   1.460 +    pop         rbp
   1.461 +    ret
   1.462 +%undef flimit2
   1.463 +
   1.464 +; For each of height rows: pick a rand()-derived offset (0..255) into noise, clamp the row's pixels with the three clamp vectors, then add the noise bytes in place, 8 bytes per step.
   1.465 +;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
   1.466 +;                            unsigned char blackclamp[16],
   1.467 +;                            unsigned char whiteclamp[16],
   1.468 +;                            unsigned char bothclamp[16],
   1.469 +;                            unsigned int width, unsigned int height, int pitch)
   1.470 +extern sym(rand)
   1.471 +global sym(vp9_plane_add_noise_mmx) PRIVATE
   1.472 +sym(vp9_plane_add_noise_mmx):
   1.473 +    push        rbp
   1.474 +    mov         rbp, rsp
   1.475 +    SHADOW_ARGS_TO_STACK 8
   1.476 +    GET_GOT     rbx
   1.477 +    push        rsi
   1.478 +    push        rdi
   1.479 +    ; end prolog
   1.480 +
   1.481 +.addnoise_loop:
   1.482 +    call sym(rand) WRT_PLT
   1.483 +    mov     rcx, arg(1) ;noise
   1.484 +    and     rax, 0xff   ; offset = rand() & 0xff
   1.485 +    add     rcx, rax    ; rcx = noise + offset
   1.486 +
   1.487 +    ; we rely on the fact that the clamping vectors are stored contiguously
   1.488 +    ; in black/white/both order. Note that we have to reload this here because
   1.489 +    ; rdx could be trashed by rand()
   1.490 +    mov     rdx, arg(2) ; blackclamp (whiteclamp is read at +16, bothclamp at +32)
   1.491 +
   1.492 +
   1.493 +            mov     rdi, rcx                      ; rdi = start of this row's noise
   1.494 +            movsxd  rcx, dword arg(5) ;[Width]
   1.495 +            mov     rsi, arg(0) ;Pos
   1.496 +            xor         rax,rax                   ; rax = byte offset within the row
   1.497 +
   1.498 +.addnoise_nextset:
   1.499 +            movq        mm1,[rsi+rax]         ; get the source
   1.500 +
   1.501 +            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   1.502 +            paddusb     mm1, [rdx+32] ;bothclamp
   1.503 +            psubusb     mm1, [rdx+16] ;whiteclamp  (only the first 8 bytes of each 16-byte clamp array are read)
   1.504 +
   1.505 +            movq        mm2,[rdi+rax]         ; get the noise for this line
   1.506 +            paddb       mm1,mm2              ; add it in (wrapping byte add)
   1.507 +            movq        [rsi+rax],mm1         ; store the result
   1.508 +
   1.509 +            add         rax,8                 ; advance to the next 8 bytes
   1.510 +
   1.511 +            cmp         rax, rcx              ; keep going while offset < width; whole 8-byte groups are always processed
   1.512 +            jl          .addnoise_nextset
   1.513 +
   1.514 +    movsxd  rax, dword arg(7) ; Pitch
   1.515 +    add     arg(0), rax ; Start += Pitch
   1.516 +    sub     dword arg(6), 1   ; Height -= 1
   1.517 +    jg      .addnoise_loop    ; loop while rows remain
   1.518 +
   1.519 +    ; begin epilog (NOTE(review): no emms is executed before ret - presumably left to the caller, confirm)
   1.520 +    pop rdi
   1.521 +    pop rsi
   1.522 +    RESTORE_GOT
   1.523 +    UNSHADOW_ARGS
   1.524 +    pop         rbp
   1.525 +    ret
   1.526 +
   1.527 +; Filter constants read by vp9_post_proc_down_and_across_mmx through [rbx + offset] and RD.
   1.528 +SECTION_RODATA
   1.529 +align 16
   1.530 +Blur:                       ; read 4 words at a time at byte offsets 0, 16, 32, 48 and 64
   1.531 +    times 16 dw 16          ; offsets 0 and 16: kernel 0 and kernel 1 taps
   1.532 +    times  8 dw 64          ; offset 32: kernel 2 (centre) tap
   1.533 +    times 16 dw 16          ; offsets 48 and 64: kernel 3 and kernel 4 taps (16+16+64+16+16 = 128)
   1.534 +    times  8 dw  0          ; offset 80: not referenced by the loads in the functions above
   1.535 +
   1.536 +rd:                         ; rounding term added before the shift by VP9_FILTER_SHIFT
   1.537 +    times 4 dw 0x40         ; 0x40 = 64, half of 128

mercurial