media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,695 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +;void vp9_post_proc_down_and_across_xmm
    1.18 +;(
    1.19 +;    unsigned char *src_ptr,      // source row; rows -2..+2 around it are read
    1.20 +;    unsigned char *dst_ptr,      // destination row; also filtered across in place
    1.21 +;    int src_pixels_per_line,     // source pitch (sign-extended before use)
    1.22 +;    int dst_pixels_per_line,     // destination pitch (sign-extended before use)
    1.23 +;    int rows,                    // number of rows to process
    1.24 +;    int cols,                    // pixels per row, handled 8 at a time
    1.25 +;    int flimit                   // low 16 bits: difference limit that disables filtering
    1.26 +;)  Per row: (4*c + 4 neighbours + 4) >> 3 down (src -> dst), then the same across dst in place.
    1.27 +global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
    1.28 +sym(vp9_post_proc_down_and_across_xmm):
    1.29 +    push        rbp
    1.30 +    mov         rbp, rsp
    1.31 +    SHADOW_ARGS_TO_STACK 7
    1.32 +    SAVE_XMM 7
    1.33 +    GET_GOT     rbx
    1.34 +    push        rsi
    1.35 +    push        rdi
    1.36 +    ; end prolog
    1.37 +
    1.38 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    1.39 +    ALIGN_STACK 16, rax
    1.40 +    ; move the global rd onto the stack, since we don't have enough registers
    1.41 +    ; to do PIC addressing
    1.42 +    movdqa      xmm0, [GLOBAL(rd42)]
    1.43 +    sub         rsp, 16
    1.44 +    movdqa      [rsp], xmm0
    1.45 +%define RD42 [rsp]
    1.46 +%else
    1.47 +%define RD42 [GLOBAL(rd42)]
    1.48 +%endif
    1.49 +
    1.50 +
    1.51 +        movd        xmm2,       dword ptr arg(6) ;flimit
    1.52 +        punpcklwd   xmm2,       xmm2              ; replicate the low word of flimit ...
    1.53 +        punpckldq   xmm2,       xmm2              ; ... into four words ...
    1.54 +        punpcklqdq  xmm2,       xmm2              ; ... and then into all eight words of xmm2
    1.55 +
    1.56 +        mov         rsi,        arg(0) ;src_ptr
    1.57 +        mov         rdi,        arg(1) ;dst_ptr
    1.58 +
    1.59 +        movsxd      rcx,        DWORD PTR arg(4) ;rows
    1.60 +        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line (source pitch)
    1.61 +        pxor        xmm0,       xmm0              ; xmm0 = 0, used to widen bytes to words
    1.62 +
    1.63 +.nextrow:
    1.64 +
    1.65 +        xor         rdx,        rdx       ; clear out rdx for use as loop counter
    1.66 +.nextcol:
    1.67 +        movq        xmm3,       QWORD PTR [rsi]         ; xmm3 = r0 p0..p7 (bytes)
    1.68 +        punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 p0..p7 (words)
    1.69 +        movdqa      xmm1,       xmm3                    ; xmm1 = r0 p0..p7, unfiltered copy
    1.70 +        psllw       xmm3,       2                       ; xmm3 = 4 * r0
    1.71 +
    1.72 +        movq        xmm5,       QWORD PTR [rsi + rax]   ; xmm5 = r1 p0..p7 (bytes)
    1.73 +        punpcklbw   xmm5,       xmm0                    ; xmm5 = r1 p0..p7 (words)
    1.74 +        paddusw     xmm3,       xmm5                    ; xmm3 += r1
    1.75 +
    1.76 +        ; thresholding
    1.77 +        movdqa      xmm7,       xmm1                    ; xmm7 = r0 p0..p7
    1.78 +        psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r1)
    1.79 +        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r1 - r0)
    1.80 +        paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r1)
    1.81 +        pcmpgtw     xmm7,       xmm2                    ; xmm7 = mask of lanes with abs diff > flimit
    1.82 +
    1.83 +        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (bytes)
    1.84 +        punpcklbw   xmm5,       xmm0                    ; xmm5 = r2 p0..p7 (words)
    1.85 +        paddusw     xmm3,       xmm5                    ; xmm3 += r2
    1.86 +
    1.87 +        ; thresholding
    1.88 +        movdqa      xmm6,       xmm1                    ; xmm6 = r0 p0..p7
    1.89 +        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r2)
    1.90 +        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r2 - r0)
    1.91 +        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r2)
    1.92 +        pcmpgtw     xmm6,       xmm2
    1.93 +        por         xmm7,       xmm6                    ; accumulate thresholds
    1.94 +
    1.95 +
    1.96 +        neg         rax                                 ; rax = -pitch, to reach the rows above
    1.97 +        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; xmm5 = r-2 p0..p7 (bytes)
    1.98 +        punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 p0..p7 (words)
    1.99 +        paddusw     xmm3,       xmm5                    ; xmm3 += r-2
   1.100 +
   1.101 +        ; thresholding
   1.102 +        movdqa      xmm6,       xmm1                    ; xmm6 = r0 p0..p7
   1.103 +        psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
   1.104 +        psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
   1.105 +        paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
   1.106 +        pcmpgtw     xmm6,       xmm2
   1.107 +        por         xmm7,       xmm6                    ; accumulate thresholds
   1.108 +
   1.109 +        movq        xmm4,       QWORD PTR [rsi+rax]     ; xmm4 = r-1 p0..p7 (bytes)
   1.110 +        punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 p0..p7 (words)
   1.111 +        paddusw     xmm3,       xmm4                    ; xmm3 += r-1
   1.112 +
   1.113 +        ; thresholding
   1.114 +        movdqa      xmm6,       xmm1                    ; xmm6 = r0 p0..p7
   1.115 +        psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
   1.116 +        psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
   1.117 +        paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
   1.118 +        pcmpgtw     xmm6,       xmm2
   1.119 +        por         xmm7,       xmm6                    ; accumulate thresholds
   1.120 +
   1.121 +
   1.122 +        paddusw     xmm3,       RD42                    ; xmm3 += round value (4 per word)
   1.123 +        psraw       xmm3,       3                       ; xmm3 /= 8
   1.124 +
   1.125 +        pand        xmm1,       xmm7                    ; keep source where a difference exceeded flimit
   1.126 +        pandn       xmm7,       xmm3                    ; take the blurred value everywhere else
   1.127 +        paddusw     xmm1,       xmm7                    ; combination (the two lane sets are disjoint)
   1.128 +
   1.129 +        packuswb    xmm1,       xmm0                    ; pack to bytes
   1.130 +        movq        QWORD PTR [rdi], xmm1             ; write 8 down-filtered pixels to dst
   1.131 +
   1.132 +        neg         rax                   ; pitch is positive
   1.133 +        add         rsi,        8
   1.134 +        add         rdi,        8
   1.135 +
   1.136 +        add         rdx,        8
   1.137 +        cmp         edx,        dword arg(5) ;cols
   1.138 +
   1.139 +        jl          .nextcol
   1.140 +
   1.141 +        ; done with the all cols, start the across filtering in place
   1.142 +        sub         rsi,        rdx       ; rewind both pointers to the start of the row
   1.143 +        sub         rdi,        rdx
   1.144 +
   1.145 +        xor         rdx,        rdx
   1.146 +        movq        mm0,        QWORD PTR [rdi-8]; prime the delayed store with the bytes already at dst-8
   1.147 +
   1.148 +.acrossnextcol:
   1.149 +        movq        xmm7,       QWORD PTR [rdi +rdx -2] ; xmm7 = p-2..p5
   1.150 +        movd        xmm4,       DWORD PTR [rdi +rdx +6] ; xmm4 = p6..p9
   1.151 +
   1.152 +        pslldq      xmm4,       8                 ; move p6..p9 up to bytes 8..11
   1.153 +        por         xmm4,       xmm7              ; xmm4 = p-2..p9 (12 bytes)
   1.154 +
   1.155 +        movdqa      xmm3,       xmm4
   1.156 +        psrldq      xmm3,       2
   1.157 +        punpcklbw   xmm3,       xmm0              ; xmm3 = p0..p7 (words)
   1.158 +        movdqa      xmm1,       xmm3              ; xmm1 = p0..p7, unfiltered copy
   1.159 +        psllw       xmm3,       2                 ; xmm3 = 4 * p0..p7
   1.160 +
   1.161 +
   1.162 +        movdqa      xmm5,       xmm4
   1.163 +        psrldq      xmm5,       3
   1.164 +        punpcklbw   xmm5,       xmm0              ; xmm5 = p1..p8
   1.165 +        paddusw     xmm3,       xmm5              ; xmm3 += p1..p8
   1.166 +
   1.167 +        ; thresholding
   1.168 +        movdqa      xmm7,       xmm1              ; xmm7 = p0..p7
   1.169 +        psubusw     xmm7,       xmm5              ; xmm7 = sat(p0..p7 - p1..p8)
   1.170 +        psubusw     xmm5,       xmm1              ; xmm5 = sat(p1..p8 - p0..p7)
   1.171 +        paddusw     xmm7,       xmm5              ; xmm7 = abs(p0..p7 - p1..p8)
   1.172 +        pcmpgtw     xmm7,       xmm2
   1.173 +
   1.174 +        movdqa      xmm5,       xmm4
   1.175 +        psrldq      xmm5,       4
   1.176 +        punpcklbw   xmm5,       xmm0              ; xmm5 = p2..p9
   1.177 +        paddusw     xmm3,       xmm5              ; xmm3 += p2..p9
   1.178 +
   1.179 +        ; thresholding
   1.180 +        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
   1.181 +        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p2..p9)
   1.182 +        psubusw     xmm5,       xmm1              ; xmm5 = sat(p2..p9 - p0..p7)
   1.183 +        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p2..p9)
   1.184 +        pcmpgtw     xmm6,       xmm2
   1.185 +        por         xmm7,       xmm6              ; accumulate thresholds
   1.186 +
   1.187 +
   1.188 +        movdqa      xmm5,       xmm4              ; xmm5 = p-2..p9
   1.189 +        punpcklbw   xmm5,       xmm0              ; xmm5 = p-2..p5
   1.190 +        paddusw     xmm3,       xmm5              ; xmm3 += p-2..p5
   1.191 +
   1.192 +        ; thresholding
   1.193 +        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
   1.194 +        psubusw     xmm6,       xmm5              ; xmm6 = sat(p0..p7 - p-2..p5)
   1.195 +        psubusw     xmm5,       xmm1              ; xmm5 = sat(p-2..p5 - p0..p7)
   1.196 +        paddusw     xmm6,       xmm5              ; xmm6 = abs(p0..p7 - p-2..p5)
   1.197 +        pcmpgtw     xmm6,       xmm2
   1.198 +        por         xmm7,       xmm6              ; accumulate thresholds
   1.199 +
   1.200 +        psrldq      xmm4,       1                   ; xmm4 = p-1..p9
   1.201 +        punpcklbw   xmm4,       xmm0              ; xmm4 = p-1..p6
   1.202 +        paddusw     xmm3,       xmm4              ; xmm3 += p-1..p6
   1.203 +
   1.204 +        ; thresholding
   1.205 +        movdqa      xmm6,       xmm1              ; xmm6 = p0..p7
   1.206 +        psubusw     xmm6,       xmm4              ; xmm6 = sat(p0..p7 - p-1..p6)
   1.207 +        psubusw     xmm4,       xmm1              ; xmm4 = sat(p-1..p6 - p0..p7)
   1.208 +        paddusw     xmm6,       xmm4              ; xmm6 = abs(p0..p7 - p-1..p6)
   1.209 +        pcmpgtw     xmm6,       xmm2
   1.210 +        por         xmm7,       xmm6              ; accumulate thresholds
   1.211 +
   1.212 +        paddusw     xmm3,       RD42              ; xmm3 += round value (4 per word)
   1.213 +        psraw       xmm3,       3                 ; xmm3 /= 8
   1.214 +
   1.215 +        pand        xmm1,       xmm7              ; keep source where a difference exceeded flimit
   1.216 +        pandn       xmm7,       xmm3              ; take the blurred value everywhere else
   1.217 +        paddusw     xmm1,       xmm7              ; combination
   1.218 +
   1.219 +        packuswb    xmm1,       xmm0              ; pack to bytes
   1.220 +        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store the previous iteration's eight bytes (delayed so p-2/p-1 above were still unfiltered)
   1.221 +        movdq2q     mm0,        xmm1              ; hold this iteration's result for the next store
   1.222 +
   1.223 +        add         rdx,        8
   1.224 +        cmp         edx,        dword arg(5) ;cols
   1.225 +        jl          .acrossnextcol;
   1.226 +
   1.227 +        ; last 8 pixels
   1.228 +        movq        QWORD PTR [rdi+rdx-8],  mm0
   1.229 +
   1.230 +        ; done with this row
   1.231 +        add         rsi,rax               ; next source line
   1.232 +        movsxd      rax, dword arg(3) ;dst_pixels_per_line, sign-extended so a negative pitch stays negative
   1.233 +        add         rdi,rax               ; next destination line
   1.234 +        movsxd      rax, dword arg(2) ;src_pixels_per_line, sign-extended exactly as at function entry
   1.235 +
   1.236 +        dec         rcx                   ; decrement count
   1.237 +        jnz         .nextrow              ; next row
   1.238 +
   1.239 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
   1.240 +    add rsp,16
   1.241 +    pop rsp
   1.242 +%endif
   1.243 +    ; begin epilog (NOTE(review): mm0 was used above and no emms appears in this routine - confirm callers clear MMX state)
   1.244 +    pop rdi
   1.245 +    pop rsi
   1.246 +    RESTORE_GOT
   1.247 +    RESTORE_XMM
   1.248 +    UNSHADOW_ARGS
   1.249 +    pop         rbp
   1.250 +    ret
   1.251 +%undef RD42
   1.252 +
   1.253 +
   1.254 +;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
   1.255 +;                            int pitch, int rows, int cols,int flimit)
   1.256 +extern sym(vp9_rv)
   1.257 +global sym(vp9_mbpost_proc_down_xmm) PRIVATE
   1.258 +sym(vp9_mbpost_proc_down_xmm):
   1.259 +    push        rbp
   1.260 +    mov         rbp, rsp
   1.261 +    SHADOW_ARGS_TO_STACK 5
   1.262 +    SAVE_XMM 7
   1.263 +    GET_GOT     rbx
   1.264 +    push        rsi
   1.265 +    push        rdi
   1.266 +    ; end prolog
   1.267 +
   1.268 +    ALIGN_STACK 16, rax
   1.269 +    sub         rsp, 128+16
   1.270 +
   1.271 +    ; unsigned char d[16][8] at [rsp]: ring buffer of the 16 most recent filtered rows
   1.272 +    ; create flimit4 (flimit replicated into four dwords) at [rsp+128]
   1.273 +    mov         eax, dword ptr arg(4) ;flimit
   1.274 +    mov         [rsp+128], eax
   1.275 +    mov         [rsp+128+4], eax
   1.276 +    mov         [rsp+128+8], eax
   1.277 +    mov         [rsp+128+12], eax
   1.278 +%define flimit4 [rsp+128]
   1.279 +
   1.280 +%if ABI_IS_32BIT=0
   1.281 +    lea         r8,       [GLOBAL(sym(vp9_rv))]
   1.282 +%endif
   1.283 +
   1.284 +    ;rows +=8;  (eight extra iterations flush the rows still held in d[])
   1.285 +    add         dword arg(2), 8
   1.286 +
   1.287 +    ;for(c=0; c<cols; c+=8)
   1.288 +.loop_col:
   1.289 +            mov         rsi,        arg(0) ; s
   1.290 +            pxor        xmm0,       xmm0        ; xmm0 = 0, used for unpacking
   1.291 +
   1.292 +            movsxd      rax,        dword ptr arg(1) ;pitch       ;
   1.293 +            neg         rax                                     ; rax = -pitch
   1.294 +
   1.295 +            lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
   1.296 +            neg         rax                                     ; rax = pitch again
   1.297 +
   1.298 +
   1.299 +            pxor        xmm5,       xmm5        ; xmm5 = running sum, 8 words (one per column)
   1.300 +            pxor        xmm6,       xmm6        ; xmm6 = running sum of squares, columns 0..3 (dwords)
   1.301 +
   1.302 +            pxor        xmm7,       xmm7        ; xmm7 = running sum of squares, columns 4..7 (dwords)
   1.303 +            mov         rdi,        rsi
   1.304 +
   1.305 +            mov         rcx,        15          ; accumulate the 15 rows s[-8*pitch] .. s[6*pitch]
   1.306 +
   1.307 +.loop_initvar:
   1.308 +            movq        xmm1,       QWORD PTR [rdi];
   1.309 +            punpcklbw   xmm1,       xmm0        ; 8 pixels as words
   1.310 +
   1.311 +            paddw       xmm5,       xmm1        ; sum += pixel
   1.312 +            pmullw      xmm1,       xmm1        ; pixel * pixel (fits in an unsigned word)
   1.313 +
   1.314 +            movdqa      xmm2,       xmm1        ;
   1.315 +            punpcklwd   xmm1,       xmm0        ; squares of columns 0..3 as dwords
   1.316 +
   1.317 +            punpckhwd   xmm2,       xmm0        ; squares of columns 4..7 as dwords
   1.318 +            paddd       xmm6,       xmm1        ;
   1.319 +
   1.320 +            paddd       xmm7,       xmm2        ;
   1.321 +            lea         rdi,        [rdi+rax]   ; next row down
   1.322 +
   1.323 +            dec         rcx
   1.324 +            jne         .loop_initvar
   1.325 +            ;save the var and sum; here rdi = &s[7*pitch]
   1.326 +            xor         rdx,        rdx         ; rdx = row counter
   1.327 +.loop_row:
   1.328 +            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8], row leaving the window
   1.329 +            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7], row entering the window
   1.330 +
   1.331 +            punpcklbw   xmm1,       xmm0
   1.332 +            punpcklbw   xmm2,       xmm0
   1.333 +
   1.334 +            paddw       xmm5,       xmm2        ; sum += entering
   1.335 +            psubw       xmm5,       xmm1        ; sum -= leaving
   1.336 +
   1.337 +            pmullw      xmm2,       xmm2        ; entering^2
   1.338 +            movdqa      xmm4,       xmm2
   1.339 +
   1.340 +            punpcklwd   xmm2,       xmm0
   1.341 +            punpckhwd   xmm4,       xmm0
   1.342 +
   1.343 +            paddd       xmm6,       xmm2        ; sumsq += entering^2
   1.344 +            paddd       xmm7,       xmm4
   1.345 +
   1.346 +            pmullw      xmm1,       xmm1        ; leaving^2
   1.347 +            movdqa      xmm2,       xmm1
   1.348 +
   1.349 +            punpcklwd   xmm1,       xmm0
   1.350 +            psubd       xmm6,       xmm1        ; sumsq -= leaving^2
   1.351 +
   1.352 +            punpckhwd   xmm2,       xmm0
   1.353 +            psubd       xmm7,       xmm2
   1.354 +
   1.355 +
   1.356 +            movdqa      xmm3,       xmm6
   1.357 +            pslld       xmm3,       4
   1.358 +
   1.359 +            psubd       xmm3,       xmm6        ; xmm3 = 15 * sumsq (columns 0..3)
   1.360 +            movdqa      xmm1,       xmm5
   1.361 +
   1.362 +            movdqa      xmm4,       xmm5
   1.363 +            pmullw      xmm1,       xmm1        ; low 16 bits of sum * sum
   1.364 +
   1.365 +            pmulhw      xmm4,       xmm4        ; high 16 bits of sum * sum
   1.366 +            movdqa      xmm2,       xmm1
   1.367 +
   1.368 +            punpcklwd   xmm1,       xmm4        ; sum * sum as dwords, columns 0..3
   1.369 +            punpckhwd   xmm2,       xmm4        ; sum * sum as dwords, columns 4..7
   1.370 +
   1.371 +            movdqa      xmm4,       xmm7
   1.372 +            pslld       xmm4,       4
   1.373 +
   1.374 +            psubd       xmm4,       xmm7        ; xmm4 = 15 * sumsq (columns 4..7)
   1.375 +
   1.376 +            psubd       xmm3,       xmm1        ; 15 * sumsq - sum * sum
   1.377 +            psubd       xmm4,       xmm2
   1.378 +
   1.379 +            psubd       xmm3,       flimit4
   1.380 +            psubd       xmm4,       flimit4
   1.381 +
   1.382 +            psrad       xmm3,       31          ; all ones where 15*sumsq - sum*sum < flimit
   1.383 +            psrad       xmm4,       31
   1.384 +
   1.385 +            packssdw    xmm3,       xmm4
   1.386 +            packsswb    xmm3,       xmm0        ; xmm3 = per-column byte mask (0xff = filter)
   1.387 +
   1.388 +            movq        xmm1,       QWORD PTR [rsi+rax*8] ; centre row of the window
   1.389 +
   1.390 +            movq        xmm2,       xmm1        ; xmm2 = unfiltered centre pixels
   1.391 +            punpcklbw   xmm1,       xmm0
   1.392 +
   1.393 +            paddw       xmm1,       xmm5        ; pixel + window sum
   1.394 +            mov         rcx,        rdx
   1.395 +
   1.396 +            and         rcx,        127         ; table index = row & 127
   1.397 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
   1.398 +            push        rax
   1.399 +            lea         rax,        [GLOBAL(sym(vp9_rv))]
   1.400 +            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
   1.401 +            pop         rax
   1.402 +%elif ABI_IS_32BIT=0
   1.403 +            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
   1.404 +%else
   1.405 +            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
   1.406 +%endif
   1.407 +
   1.408 +            paddw       xmm1,       xmm4        ; add 8 words read from the vp9_rv table
   1.409 +            ;paddw     xmm1,       eight8s
   1.410 +            psraw       xmm1,       4           ; (pixel + sum + rv) >> 4
   1.411 +
   1.412 +            packuswb    xmm1,       xmm0
   1.413 +            pand        xmm1,       xmm3        ; filtered value where the mask is set
   1.414 +
   1.415 +            pandn       xmm3,       xmm2        ; original pixel elsewhere
   1.416 +            por         xmm1,       xmm3
   1.417 +
   1.418 +            and         rcx,        15
   1.419 +            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[row & 15] = result for this row
   1.420 +
   1.421 +            mov         rcx,        rdx
   1.422 +            sub         rcx,        8
   1.423 +
   1.424 +            and         rcx,        15
   1.425 +            movq        mm0,        [rsp + rcx*8] ;d[(row - 8) & 15], not written in this column pass while row < 8
   1.426 +
   1.427 +            movq        [rsi],      mm0         ; delayed write to the row leaving the window (8 above centre)
   1.428 +            lea         rsi,        [rsi+rax]
   1.429 +
   1.430 +            lea         rdi,        [rdi+rax]
   1.431 +            add         rdx,        1
   1.432 +
   1.433 +            cmp         edx,        dword arg(2) ;rows
   1.434 +            jl          .loop_row
   1.435 +
   1.436 +        mov         rax, 8          ; rax is dead here: .loop_col reloads the pitch into it
   1.437 +        add         arg(0), rax     ; s += 8, pointer-width add so a carry past bit 31 is kept
   1.438 +        sub         dword arg(3), 8 ; cols -= 8
   1.439 +        jg          .loop_col       ; flags from the sub: loop while the remaining cols > 0
   1.440 +
   1.441 +    add         rsp, 128+16
   1.442 +    pop         rsp
   1.443 +
   1.444 +    ; begin epilog (NOTE(review): mm0 was used above and no emms appears in this routine - confirm callers clear MMX state)
   1.445 +    pop rdi
   1.446 +    pop rsi
   1.447 +    RESTORE_GOT
   1.448 +    RESTORE_XMM
   1.449 +    UNSHADOW_ARGS
   1.450 +    pop         rbp
   1.451 +    ret
   1.452 +%undef flimit4
   1.453 +
   1.454 +
   1.455 +;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
   1.456 +;                                int pitch, int rows, int cols,int flimit)
   1.457 +global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
   1.458 +sym(vp9_mbpost_proc_across_ip_xmm):
   1.459 +    push        rbp
   1.460 +    mov         rbp, rsp
   1.461 +    SHADOW_ARGS_TO_STACK 5
   1.462 +    SAVE_XMM 7
   1.463 +    GET_GOT     rbx
   1.464 +    push        rsi
   1.465 +    push        rdi
   1.466 +    ; end prolog
   1.467 +
   1.468 +    ALIGN_STACK 16, rax
   1.469 +    sub         rsp, 16
   1.470 +
   1.471 +    ; create flimit4 (flimit replicated into four dwords) at [rsp]
   1.472 +    mov         eax, dword ptr arg(4) ;flimit
   1.473 +    mov         [rsp], eax
   1.474 +    mov         [rsp+4], eax
   1.475 +    mov         [rsp+8], eax
   1.476 +    mov         [rsp+12], eax
   1.477 +%define flimit4 [rsp]
   1.478 +
   1.479 +
   1.480 +    ;for(r=0;r<rows;r++)
   1.481 +.ip_row_loop:
   1.482 +
   1.483 +        xor         rdx,    rdx ;sumsq=0;
   1.484 +        xor         rcx,    rcx ;sum=0;
   1.485 +        mov         rsi,    arg(0); s
   1.486 +        mov         rdi,    -8
   1.487 +.ip_var_loop:
   1.488 +        ;for(i=-8;i<=6;i++)
   1.489 +        ;{
   1.490 +        ;    sumsq += s[i]*s[i];
   1.491 +        ;    sum   += s[i];
   1.492 +        ;}
   1.493 +        movzx       eax, byte [rsi+rdi] ; eax = s[i]
   1.494 +        add         ecx, eax            ; sum += s[i]
   1.495 +        mul         al                  ; ax = s[i] * s[i]; upper half of eax is still zero
   1.496 +        add         edx, eax            ; sumsq += s[i] * s[i]
   1.497 +        add         rdi, 1
   1.498 +        cmp         rdi, 6
   1.499 +        jle         .ip_var_loop
   1.500 +
   1.501 +
   1.502 +            ;mov         rax,    sumsq
   1.503 +            ;movd        xmm7,   rax
   1.504 +            movd        xmm7,   edx     ; xmm7 lane 0 = sumsq over s[-8..6]
   1.505 +
   1.506 +            ;mov         rax,    sum
   1.507 +            ;movd        xmm6,   rax
   1.508 +            movd        xmm6,   ecx     ; xmm6 lane 0 = sum over s[-8..6]
   1.509 +
   1.510 +            mov         rsi,    arg(0) ;s
   1.511 +            xor         rcx,    rcx     ; rcx = column index c, advanced by 4
   1.512 +
   1.513 +            movsxd      rdx,    dword arg(3) ;cols
   1.514 +            add         rdx,    8       ; loop bound is cols + 8: extra iterations drain the delayed stores
   1.515 +            pxor        mm0,    mm0     ; mm0/mm1 form a two-stage store delay, initially zero
   1.516 +            pxor        mm1,    mm1
   1.517 +
   1.518 +            pxor        xmm0,   xmm0    ; xmm0 = 0, used for unpacking
   1.519 +.nextcol4:
   1.520 +
   1.521 +            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5 (pixels leaving the window)
   1.522 +            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (pixels entering the window)
   1.523 +
   1.524 +            punpcklbw   xmm1,   xmm0                    ; expanding
   1.525 +            punpcklbw   xmm2,   xmm0                    ; expanding
   1.526 +
   1.527 +            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
   1.528 +            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
   1.529 +
   1.530 +            psubd       xmm2,   xmm1                    ; xmm2 = entering - leaving (change of sum per column)
   1.531 +            paddd       xmm1,   xmm1                    ; xmm1 = 2 * leaving
   1.532 +
   1.533 +            paddd       xmm1,   xmm2                    ; xmm1 = entering + leaving
   1.534 +            pmaddwd     xmm1,   xmm2                    ; xmm1 = entering^2 - leaving^2 (change of sumsq per column)
   1.535 +
   1.536 +            paddd       xmm6,   xmm2                    ; lane 0: carried sum + first delta
   1.537 +            paddd       xmm7,   xmm1                    ; lane 0: carried sumsq + first delta
   1.538 +
   1.539 +            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 (sum for column c) to all lanes
   1.540 +            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 (sumsq for column c) to all lanes
   1.541 +
   1.542 +            psrldq      xmm1,       4                   ; sumsq deltas: 2nd 3rd 4th 0000
   1.543 +            psrldq      xmm2,       4                   ; sum deltas:   2nd 3rd 4th 0000
   1.544 +
   1.545 +            pshufd      xmm3,   xmm1,   3               ; 0000  2nd  2nd  2nd  (sumsq delta)
   1.546 +            pshufd      xmm4,   xmm2,   3               ; 0000  2nd  2nd  2nd  (sum delta)
   1.547 +
   1.548 +            paddd       xmm6,   xmm4
   1.549 +            paddd       xmm7,   xmm3
   1.550 +
   1.551 +            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000  3rd  3rd  (sumsq delta)
   1.552 +            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000  3rd  3rd  (sum delta)
   1.553 +
   1.554 +            paddd       xmm7,   xmm3
   1.555 +            paddd       xmm6,   xmm4
   1.556 +
   1.557 +            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000  0000  4th  (sumsq delta)
   1.558 +            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000  0000  4th  (sum delta)
   1.559 +
   1.560 +            paddd       xmm7,   xmm3                    ; xmm7 = window sumsq for columns c..c+3
   1.561 +            paddd       xmm6,   xmm4                    ; xmm6 = window sum for columns c..c+3
   1.562 +
   1.563 +            movdqa      xmm3,   xmm6
   1.564 +            pmaddwd     xmm3,   xmm3                    ; xmm3 = sum * sum
   1.565 +
   1.566 +            movdqa      xmm5,   xmm7
   1.567 +            pslld       xmm5,   4
   1.568 +
   1.569 +            psubd       xmm5,   xmm7                    ; xmm5 = 15 * sumsq
   1.570 +            psubd       xmm5,   xmm3                    ; xmm5 = 15 * sumsq - sum * sum
   1.571 +
   1.572 +            psubd       xmm5,   flimit4
   1.573 +            psrad       xmm5,   31                      ; all ones where 15*sumsq - sum*sum < flimit
   1.574 +
   1.575 +            packssdw    xmm5,   xmm0
   1.576 +            packsswb    xmm5,   xmm0                    ; xmm5 = byte mask for 4 pixels (0xff = filter)
   1.577 +
   1.578 +            movd        xmm1,   DWORD PTR [rsi+rcx]     ; centre pixels s[c..c+3]
   1.579 +            movq        xmm2,   xmm1                    ; xmm2 = unfiltered copy
   1.580 +
   1.581 +            punpcklbw   xmm1,   xmm0
   1.582 +            punpcklwd   xmm1,   xmm0
   1.583 +
   1.584 +            paddd       xmm1,   xmm6                    ; pixel + window sum
   1.585 +            paddd       xmm1,   [GLOBAL(four8s)]        ; + 8 for rounding
   1.586 +
   1.587 +            psrad       xmm1,   4                       ; (pixel + sum + 8) >> 4
   1.588 +            packssdw    xmm1,   xmm0
   1.589 +
   1.590 +            packuswb    xmm1,   xmm0
   1.591 +            pand        xmm1,   xmm5                    ; filtered value where the mask is set
   1.592 +
   1.593 +            pandn       xmm5,   xmm2                    ; original pixel elsewhere
   1.594 +            por         xmm5,   xmm1                    ; xmm5 = output for columns c..c+3
   1.595 +
   1.596 +            movd        [rsi+rcx-8],  mm0               ; store the result from two iterations ago (zeros on the first two passes)
   1.597 +            movq        mm0,    mm1                     ; advance the two-stage delay
   1.598 +
   1.599 +            movdq2q     mm1,    xmm5
   1.600 +            psrldq      xmm7,   12                      ; carry sumsq of column c+3 into lane 0
   1.601 +
   1.602 +            psrldq      xmm6,   12                      ; carry sum of column c+3 into lane 0
   1.603 +            add         rcx,    4
   1.604 +
   1.605 +            cmp         rcx,    rdx
   1.606 +            jl          .nextcol4
   1.607 +
   1.608 +        ;s+=pitch;
   1.609 +        movsxd rax, dword arg(1)
   1.610 +        add    arg(0), rax
   1.611 +
   1.612 +        sub dword arg(2), 1 ;rows-=1
   1.613 +        cmp dword arg(2), 0
   1.614 +        jg .ip_row_loop
   1.615 +
   1.616 +    add         rsp, 16
   1.617 +    pop         rsp
   1.618 +
   1.619 +    ; begin epilog (NOTE(review): mm0/mm1 were used above and no emms appears in this routine - confirm callers clear MMX state)
   1.620 +    pop rdi
   1.621 +    pop rsi
   1.622 +    RESTORE_GOT
   1.623 +    RESTORE_XMM
   1.624 +    UNSHADOW_ARGS
   1.625 +    pop         rbp
   1.626 +    ret
   1.627 +%undef flimit4
   1.628 +
   1.629 +
   1.630 +;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
   1.631 +;                            unsigned char blackclamp[16],
   1.632 +;                            unsigned char whiteclamp[16],
   1.633 +;                            unsigned char bothclamp[16],
   1.634 +;                            unsigned int width, unsigned int height, int pitch)
   1.635 +extern sym(rand)
   1.636 +global sym(vp9_plane_add_noise_wmt) PRIVATE
   1.637 +sym(vp9_plane_add_noise_wmt):
   1.638 +    push        rbp
   1.639 +    mov         rbp, rsp
   1.640 +    SHADOW_ARGS_TO_STACK 8
   1.641 +    GET_GOT     rbx
   1.642 +    push        rsi
   1.643 +    push        rdi
   1.644 +    ; end prolog
   1.645 +
   1.646 +.addnoise_loop:
   1.647 +    call sym(rand) WRT_PLT
   1.648 +    mov     rcx, arg(1) ;noise
   1.649 +    and     rax, 0xff   ; keep the low 8 bits of rand() as a byte offset into noise
   1.650 +    add     rcx, rax    ; rcx = noise + (rand() & 0xff)
   1.651 +
   1.652 +    ; we rely on the fact that the clamping vectors are stored contiguously
   1.653 +    ; in black/white/both order. Note that we have to reload this here because
   1.654 +    ; rdx could be trashed by rand()
   1.655 +    mov     rdx, arg(2) ; blackclamp
   1.656 +
   1.657 +
   1.658 +            mov     rdi, rcx            ; rdi = noise start for this line
   1.659 +            movsxd  rcx, dword arg(5) ;[Width]
   1.660 +            mov     rsi, arg(0) ;Pos
   1.661 +            xor         rax,rax         ; rax = byte offset within the line
   1.662 +
   1.663 +.addnoise_nextset:
   1.664 +            movdqu      xmm1,[rsi+rax]         ; get the source
   1.665 +
   1.666 +            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   1.667 +            paddusb     xmm1, [rdx+32] ;bothclamp
   1.668 +            psubusb     xmm1, [rdx+16] ;whiteclamp
   1.669 +
   1.670 +            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
   1.671 +            paddb       xmm1,xmm2              ; add it in (wrapping byte add)
   1.672 +            movdqu      [rsi+rax],xmm1         ; store the result
   1.673 +
   1.674 +            add         rax,16                 ; move to the next 16 bytes of this line
   1.675 +
   1.676 +            cmp         rax, rcx
   1.677 +            jl          .addnoise_nextset      ; whole 16-byte groups: may touch up to 15 bytes past width
   1.678 +
   1.679 +    movsxd  rax, dword arg(7) ; Pitch
   1.680 +    add     arg(0), rax ; Start += Pitch
   1.681 +    sub     dword arg(6), 1   ; Height -= 1
   1.682 +    jg      .addnoise_loop    ; flags from the sub: loop while the remaining height > 0
   1.683 +
   1.684 +    ; begin epilog
   1.685 +    pop rdi
   1.686 +    pop rsi
   1.687 +    RESTORE_GOT
   1.688 +    UNSHADOW_ARGS
   1.689 +    pop         rbp
   1.690 +    ret
   1.691 +
   1.692 +
   1.693 +SECTION_RODATA
   1.694 +align 16                ; both tables are read as 16-byte SSE memory operands
   1.695 +rd42:                   ; eight words of 4: rounding term added before the >> 3 in the down/across filter
   1.696 +    times 8 dw 0x04
   1.697 +four8s:                 ; four dwords of 8: rounding term added before the >> 4 in the across_ip filter
   1.698 +    times 4 dd 8

mercurial