media/libvpx/vp8/common/x86/postproc_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/common/x86/postproc_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,721 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +
    1.17 +;macro in deblock functions. In: xmm0 = centre pixels, xmm1/xmm3 = first two neighbours, flimit = 16 byte limits. Out: xmm5 = avg(neighbours), xmm7 = mask (0xFF = keep original). xmm0 is preserved; xmm1-xmm4 and xmm6 are clobbered.
    1.18 +%macro FIRST_2_ROWS 0
    1.19 +        movdqa      xmm4,       xmm0             ; xmm4 = centre, becomes |centre - n1|
    1.20 +        movdqa      xmm6,       xmm0             ; xmm6 = centre, becomes |centre - n2|
    1.21 +        movdqa      xmm5,       xmm1
    1.22 +        pavgb       xmm5,       xmm3             ; xmm5 = rounded byte average of n1 and n2
    1.23 +
    1.24 +        ;calculate absolute value: |a - b| = sat(a - b) + sat(b - a) for unsigned bytes
    1.25 +        psubusb     xmm4,       xmm1
    1.26 +        psubusb     xmm1,       xmm0
    1.27 +        psubusb     xmm6,       xmm3
    1.28 +        psubusb     xmm3,       xmm0
    1.29 +        paddusb     xmm4,       xmm1             ; xmm4 = |centre - n1|
    1.30 +        paddusb     xmm6,       xmm3             ; xmm6 = |centre - n2|
    1.31 +
    1.32 +        ;get threshold
    1.33 +        movdqa      xmm2,       flimit           ; flimit is a %define'd aligned memory operand
    1.34 +        pxor        xmm1,       xmm1             ; xmm1 = 0, used as the compare operand
    1.35 +        movdqa      xmm7,       xmm2
    1.36 +
    1.37 +        ;get mask: sat(flimit - |diff|) == 0  <=>  |diff| >= flimit
    1.38 +        psubusb     xmm2,       xmm4
    1.39 +        psubusb     xmm7,       xmm6
    1.40 +        pcmpeqb     xmm2,       xmm1             ; 0xFF where |centre - n1| >= flimit
    1.41 +        pcmpeqb     xmm7,       xmm1             ; 0xFF where |centre - n2| >= flimit
    1.42 +        por         xmm7,       xmm2             ; xmm7 = combined mask for this pair
    1.43 +%endmacro
    1.44 +
    1.45 +%macro SECOND_2_ROWS 0
    1.46 +        movdqa      xmm6,       xmm0             ; xmm6 = centre, becomes |centre - n3|
    1.47 +        movdqa      xmm4,       xmm0             ; xmm4 = centre, becomes |centre - n4|
    1.48 +        movdqa      xmm2,       xmm1             ; keep n3; xmm1 is overwritten by the average
    1.49 +        pavgb       xmm1,       xmm3             ; xmm1 = rounded average of n3 and n4
    1.50 +
    1.51 +        ;calculate absolute value: |a - b| = sat(a - b) + sat(b - a) for unsigned bytes
    1.52 +        psubusb     xmm6,       xmm2
    1.53 +        psubusb     xmm2,       xmm0
    1.54 +        psubusb     xmm4,       xmm3
    1.55 +        psubusb     xmm3,       xmm0
    1.56 +        paddusb     xmm6,       xmm2             ; xmm6 = |centre - n3|
    1.57 +        paddusb     xmm4,       xmm3             ; xmm4 = |centre - n4|
    1.58 +
    1.59 +        pavgb       xmm5,       xmm1             ; xmm5 = avg(avg(n1,n2), avg(n3,n4)); xmm5/xmm7 come from FIRST_2_ROWS
    1.60 +
    1.61 +        ;get threshold
    1.62 +        movdqa      xmm2,       flimit
    1.63 +        pxor        xmm1,       xmm1             ; xmm1 = 0, used as the compare operand
    1.64 +        movdqa      xmm3,       xmm2
    1.65 +
    1.66 +        ;get mask: sat(flimit - |diff|) == 0  <=>  |diff| >= flimit
    1.67 +        psubusb     xmm2,       xmm6
    1.68 +        psubusb     xmm3,       xmm4
    1.69 +        pcmpeqb     xmm2,       xmm1
    1.70 +        pcmpeqb     xmm3,       xmm1
    1.71 +
    1.72 +        por         xmm7,       xmm2             ; accumulate into the mask of the first pair
    1.73 +        por         xmm7,       xmm3             ; xmm7 = 0xFF where any of the 4 differences >= flimit
    1.74 +
    1.75 +        pavgb       xmm5,       xmm0             ; xmm5 = filtered value: average of neighbours and centre
    1.76 +
    1.77 +        ;decide whether or not to use the filtered value (byte-wise select)
    1.78 +        pand        xmm0,       xmm7             ; original pixel where the mask is set
    1.79 +        pandn       xmm7,       xmm5             ; filtered pixel where the mask is clear
    1.80 +        paddusb     xmm0,       xmm7             ; xmm0 = result (the two halves never overlap)
    1.81 +%endmacro
    1.82 +
    1.83 +%macro UPDATE_FLIMIT 0
    1.84 +        movdqa      xmm2,       XMMWORD PTR [rbx]   ; next 16 bytes of limits (aligned load); clobbers xmm2
    1.85 +        movdqa      [rsp],      xmm2                ; copy to the stack slot that callers %define as flimit
    1.86 +        add         rbx,        16                  ; advance the limits pointer to the next 16 columns
    1.87 +%endmacro
    1.88 +
    1.89 +;void vp8_post_proc_down_and_across_mb_row_sse2 - per row: vertical ("down") filter src->dst, then horizontal ("across") filter in place on dst
    1.90 +;(
    1.91 +;    unsigned char *src_ptr,              read at rows -2..+2 around the current row
    1.92 +;    unsigned char *dst_ptr,              written, including 8 bytes left and right of each row
    1.93 +;    int src_pixels_per_line,             source stride (sign-extended)
    1.94 +;    int dst_pixels_per_line,             destination stride (sign-extended)
    1.95 +;    int cols,                            pixels per row, processed 16 at a time
    1.96 +;    int *flimits,                        read with movdqa as 16 byte limits per 16 columns
    1.97 +;    int size                             number of rows to process
    1.98 +;)
    1.99 +global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
   1.100 +sym(vp8_post_proc_down_and_across_mb_row_sse2):
   1.101 +    push        rbp
   1.102 +    mov         rbp, rsp
   1.103 +    SHADOW_ARGS_TO_STACK 7
   1.104 +    SAVE_XMM 7
   1.105 +    push        rbx
   1.106 +    push        rsi
   1.107 +    push        rdi
   1.108 +    ; end prolog
   1.109 +    ALIGN_STACK 16, rax
   1.110 +    sub         rsp, 16                          ; 16-byte aligned scratch slot for the current limits
   1.111 +
   1.112 +        ; put flimit on stack
   1.113 +        mov         rbx,        arg(5)           ;flimits ptr
   1.114 +        UPDATE_FLIMIT
   1.115 +
   1.116 +%define flimit [rsp]
   1.117 +
   1.118 +        mov         rsi,        arg(0)           ;src_ptr
   1.119 +        mov         rdi,        arg(1)           ;dst_ptr
   1.120 +
   1.121 +        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
   1.122 +        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
   1.123 +.nextrow:
   1.124 +        xor         rdx,        rdx              ;col
   1.125 +.nextcol:
   1.126 +        ;load current and next 2 rows
   1.127 +        movdqu      xmm0,       XMMWORD PTR [rsi]
   1.128 +        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
   1.129 +        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]
   1.130 +
   1.131 +        FIRST_2_ROWS
   1.132 +
   1.133 +        ;load above 2 rows
   1.134 +        neg         rax                          ; negative stride to address rows above
   1.135 +        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
   1.136 +        movdqu      xmm3,       XMMWORD PTR [rsi + rax]
   1.137 +
   1.138 +        SECOND_2_ROWS
   1.139 +
   1.140 +        movdqu      XMMWORD PTR [rdi], xmm0      ; store 16 vertically filtered pixels
   1.141 +
   1.142 +        neg         rax                          ; positive stride
   1.143 +        add         rsi,        16
   1.144 +        add         rdi,        16
   1.145 +
   1.146 +        add         rdx,        16
   1.147 +        cmp         edx,        dword arg(4)     ;cols
   1.148 +        jge         .downdone
   1.149 +        UPDATE_FLIMIT
   1.150 +        jmp         .nextcol
   1.151 +
   1.152 +.downdone:
   1.153 +        ; done with all the cols, start the across filtering in place
   1.154 +        sub         rsi,        rdx              ; rewind both pointers to the start of the row
   1.155 +        sub         rdi,        rdx
   1.156 +
   1.157 +        mov         rbx,        arg(5) ; flimits
   1.158 +        UPDATE_FLIMIT
   1.159 +
   1.160 +        ; dup the first byte into the left border 8 times
   1.161 +        movq        mm1,   [rdi]
   1.162 +        punpcklbw   mm1,   mm1
   1.163 +        punpcklwd   mm1,   mm1
   1.164 +        punpckldq   mm1,   mm1
   1.165 +        mov         rdx,    -8
   1.166 +        movq        [rdi+rdx], mm1
   1.167 +
   1.168 +        ; dup the last byte into the right border
   1.169 +        movsxd      rdx,    dword arg(4)
   1.170 +        movq        mm1,   [rdi + rdx + -1]
   1.171 +        punpcklbw   mm1,   mm1
   1.172 +        punpcklwd   mm1,   mm1
   1.173 +        punpckldq   mm1,   mm1
   1.174 +        movq        [rdi+rdx], mm1
   1.175 +
   1.176 +        xor         rdx,        rdx
   1.177 +        movq        mm0,        QWORD PTR [rdi-16];  prime the delayed-store registers with the
   1.178 +        movq        mm1,        QWORD PTR [rdi-8];   bytes already there, so the first store is a no-op
   1.179 +
   1.180 +.acrossnextcol:
   1.181 +        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
   1.182 +        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
   1.183 +        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]
   1.184 +
   1.185 +        FIRST_2_ROWS
   1.186 +
   1.187 +        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
   1.188 +        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]
   1.189 +
   1.190 +        SECOND_2_ROWS
   1.191 +
   1.192 +        ; results are written one iteration late so the loads above (which reach back to rdx-2) still see unfiltered pixels
   1.193 +        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
   1.194 +        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
   1.195 +        movdq2q     mm0,        xmm0             ; hold low 8 result bytes
   1.196 +        psrldq      xmm0,       8
   1.197 +        movdq2q     mm1,        xmm0             ; hold high 8 result bytes
   1.198 +        add         rdx,        16
   1.199 +        cmp         edx,        dword arg(4)     ;cols
   1.200 +        jge         .acrossdone
   1.201 +        UPDATE_FLIMIT
   1.202 +        jmp         .acrossnextcol
   1.203 +
   1.204 +.acrossdone:
   1.205 +        ; last 16 pixels
   1.206 +        movq        QWORD PTR [rdi+rdx-16], mm0
   1.207 +
   1.208 +        cmp         edx,        dword arg(4)     ; rdx overshot cols => the final 8 bytes are not stored
   1.209 +        jne         .throw_last_8
   1.210 +        movq        QWORD PTR [rdi+rdx-8], mm1
   1.211 +.throw_last_8:
   1.212 +        ; done with this row
   1.213 +        add         rsi,rax                      ;next src line
   1.214 +        movsxd      rax, dword arg(3)            ;dst_pixels_per_line (sign-extended, like the load before .nextrow)
   1.215 +        add         rdi,rax                      ;next destination
   1.216 +        movsxd      rax, dword arg(2)            ;src_pixels_per_line (sign-extended, like the load before .nextrow)
   1.217 +
   1.218 +        mov         rbx,        arg(5)           ;flimits
   1.219 +        UPDATE_FLIMIT
   1.220 +
   1.221 +        dec         rcx                          ;decrement count
   1.222 +        jnz         .nextrow                     ;next row
   1.223 +
   1.224 +    add rsp, 16
   1.225 +    pop rsp
   1.226 +    ; begin epilog
   1.227 +    pop rdi
   1.228 +    pop rsi
   1.229 +    pop rbx
   1.230 +    RESTORE_XMM
   1.231 +    UNSHADOW_ARGS
   1.232 +    pop         rbp
   1.233 +    ret
   1.234 +%undef flimit
   1.235 +
   1.236 +;void vp8_mbpost_proc_down_xmm(unsigned char *dst,          - vertical post-filter, in place, 8 columns per pass
   1.237 +;                            int pitch, int rows, int cols,int flimit)   - also writes 8 border rows above and below
   1.238 +extern sym(vp8_rv)
   1.239 +global sym(vp8_mbpost_proc_down_xmm) PRIVATE
   1.240 +sym(vp8_mbpost_proc_down_xmm):
   1.241 +    push        rbp
   1.242 +    mov         rbp, rsp
   1.243 +    SHADOW_ARGS_TO_STACK 5
   1.244 +    SAVE_XMM 7
   1.245 +    GET_GOT     rbx
   1.246 +    push        rsi
   1.247 +    push        rdi
   1.248 +    ; end prolog
   1.249 +
   1.250 +    ALIGN_STACK 16, rax
   1.251 +    sub         rsp, 128+16
   1.252 +
   1.253 +    ; unsigned char d[16][8] at [rsp]  (ring buffer of the last 16 output rows)
   1.254 +    ; create flimit4 at [rsp+128]      (flimit replicated into 4 dwords)
   1.255 +    mov         eax, dword ptr arg(4) ;flimit
   1.256 +    mov         [rsp+128], eax
   1.257 +    mov         [rsp+128+4], eax
   1.258 +    mov         [rsp+128+8], eax
   1.259 +    mov         [rsp+128+12], eax
   1.260 +%define flimit4 [rsp+128]
   1.261 +
   1.262 +%if ABI_IS_32BIT=0
   1.263 +    lea         r8,       [GLOBAL(sym(vp8_rv))]
   1.264 +%endif
   1.265 +
   1.266 +    ;rows +=8;
   1.267 +    add         dword arg(2), 8
   1.268 +
   1.269 +    ;for(c=0; c<cols; c+=8)
   1.270 +.loop_col:
   1.271 +            mov         rsi,        arg(0) ; s
   1.272 +            pxor        xmm0,       xmm0        ; xmm0 = 0 for the whole column pass (unpack / compare operand)
   1.273 +
   1.274 +            movsxd      rax,        dword ptr arg(1) ;pitch       ;
   1.275 +
   1.276 +            ; this copies the last row down into the border 8 rows
   1.277 +            mov         rdi,        rsi
   1.278 +            movsxd      rdx,        dword arg(2)                ; rows+8; only the low dword of the slot holds the int
   1.279 +            sub         rdx,        9                           ; (rows+8)-9 = index of the last real row
   1.280 +            imul        rdx,        rax
   1.281 +            lea         rdi,        [rdi+rdx]
   1.282 +            movq        xmm1,       QWORD ptr[rdi]              ; last row
   1.283 +            mov         rcx,        8
   1.284 +.init_borderd:                                                   ; initialize borders
   1.285 +            lea         rdi,        [rdi + rax]
   1.286 +            movq        [rdi],      xmm1
   1.287 +
   1.288 +            dec         rcx
   1.289 +            jne         .init_borderd
   1.290 +
   1.291 +            neg         rax                                     ; rax = -pitch
   1.292 +
   1.293 +            ; this copies the first row up into the border 8 rows
   1.294 +            mov         rdi,        rsi
   1.295 +            movq        xmm1,       QWORD ptr[rdi]              ; first row
   1.296 +            mov         rcx,        8
   1.297 +.init_border:                                                   ; initialize borders
   1.298 +            lea         rdi,        [rdi + rax]
   1.299 +            movq        [rdi],      xmm1
   1.300 +
   1.301 +            dec         rcx
   1.302 +            jne         .init_border
   1.303 +
   1.304 +
   1.305 +
   1.306 +            lea         rsi,        [rsi + rax*8];              ; rsi = &s[-pitch*8]
   1.307 +            neg         rax                                     ; rax = +pitch again
   1.308 +
   1.309 +            pxor        xmm5,       xmm5        ; xmm5 = 8 word sums, one per column
   1.310 +            pxor        xmm6,       xmm6        ; xmm6 = sums of squares, columns 0-3 (dwords)
   1.311 +
   1.312 +            pxor        xmm7,       xmm7        ; xmm7 = sums of squares, columns 4-7 (dwords)
   1.313 +            mov         rdi,        rsi
   1.314 +
   1.315 +            mov         rcx,        15          ; accumulate the 15 rows s[-8*pitch] .. s[6*pitch]
   1.316 +
   1.317 +.loop_initvar:
   1.318 +            movq        xmm1,       QWORD PTR [rdi];
   1.319 +            punpcklbw   xmm1,       xmm0        ; bytes -> words
   1.320 +
   1.321 +            paddw       xmm5,       xmm1        ; sum += x
   1.322 +            pmullw      xmm1,       xmm1        ; x*x (fits in 16 bits for byte input)
   1.323 +
   1.324 +            movdqa      xmm2,       xmm1        ;
   1.325 +            punpcklwd   xmm1,       xmm0        ; squares of columns 0-3 as dwords
   1.326 +
   1.327 +            punpckhwd   xmm2,       xmm0        ; squares of columns 4-7 as dwords
   1.328 +            paddd       xmm6,       xmm1        ;
   1.329 +
   1.330 +            paddd       xmm7,       xmm2        ;
   1.331 +            lea         rdi,        [rdi+rax]   ; next row; ends at s[7*pitch]
   1.332 +
   1.333 +            dec         rcx
   1.334 +            jne         .loop_initvar
   1.335 +            ;save the var and sum
   1.336 +            xor         rdx,        rdx         ; rdx = row counter
   1.337 +.loop_row:
   1.338 +            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]  row leaving the window
   1.339 +            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]  row entering the window
   1.340 +
   1.341 +            punpcklbw   xmm1,       xmm0
   1.342 +            punpcklbw   xmm2,       xmm0
   1.343 +
   1.344 +            paddw       xmm5,       xmm2        ; sum += entering
   1.345 +            psubw       xmm5,       xmm1        ; sum -= leaving
   1.346 +
   1.347 +            pmullw      xmm2,       xmm2
   1.348 +            movdqa      xmm4,       xmm2
   1.349 +
   1.350 +            punpcklwd   xmm2,       xmm0
   1.351 +            punpckhwd   xmm4,       xmm0
   1.352 +
   1.353 +            paddd       xmm6,       xmm2        ; sumsq += entering^2
   1.354 +            paddd       xmm7,       xmm4
   1.355 +
   1.356 +            pmullw      xmm1,       xmm1
   1.357 +            movdqa      xmm2,       xmm1
   1.358 +
   1.359 +            punpcklwd   xmm1,       xmm0
   1.360 +            psubd       xmm6,       xmm1        ; sumsq -= leaving^2
   1.361 +
   1.362 +            punpckhwd   xmm2,       xmm0
   1.363 +            psubd       xmm7,       xmm2
   1.364 +
   1.365 +
   1.366 +            movdqa      xmm3,       xmm6
   1.367 +            pslld       xmm3,       4
   1.368 +
   1.369 +            psubd       xmm3,       xmm6        ; xmm3 = sumsq*15 (columns 0-3)
   1.370 +            movdqa      xmm1,       xmm5
   1.371 +
   1.372 +            movdqa      xmm4,       xmm5
   1.373 +            pmullw      xmm1,       xmm1        ; low  16 bits of sum*sum
   1.374 +
   1.375 +            pmulhw      xmm4,       xmm4        ; high 16 bits of sum*sum
   1.376 +            movdqa      xmm2,       xmm1
   1.377 +
   1.378 +            punpcklwd   xmm1,       xmm4        ; sum*sum as dwords, columns 0-3
   1.379 +            punpckhwd   xmm2,       xmm4        ; sum*sum as dwords, columns 4-7
   1.380 +
   1.381 +            movdqa      xmm4,       xmm7
   1.382 +            pslld       xmm4,       4
   1.383 +
   1.384 +            psubd       xmm4,       xmm7        ; xmm4 = sumsq*15 (columns 4-7)
   1.385 +
   1.386 +            psubd       xmm3,       xmm1        ; sumsq*15 - sum*sum
   1.387 +            psubd       xmm4,       xmm2
   1.388 +
   1.389 +            psubd       xmm3,       flimit4
   1.390 +            psubd       xmm4,       flimit4
   1.391 +
   1.392 +            psrad       xmm3,       31          ; all ones where sumsq*15 - sum*sum < flimit
   1.393 +            psrad       xmm4,       31
   1.394 +
   1.395 +            packssdw    xmm3,       xmm4
   1.396 +            packsswb    xmm3,       xmm0        ; xmm3 = byte mask: 0xFF = use the filtered value
   1.397 +
   1.398 +            movq        xmm1,       QWORD PTR [rsi+rax*8]   ; current row s[0]
   1.399 +
   1.400 +            movq        xmm2,       xmm1        ; keep the unfiltered bytes
   1.401 +            punpcklbw   xmm1,       xmm0
   1.402 +
   1.403 +            paddw       xmm1,       xmm5        ; x + sum
   1.404 +            mov         rcx,        rdx
   1.405 +
   1.406 +            and         rcx,        127         ; row & 127 indexes the 16-bit vp8_rv table
   1.407 +%if ABI_IS_32BIT=1 && CONFIG_PIC=1
   1.408 +            push        rax
   1.409 +            lea         rax,        [GLOBAL(sym(vp8_rv))]
   1.410 +            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
   1.411 +            pop         rax
   1.412 +%elif ABI_IS_32BIT=0
   1.413 +            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
   1.414 +%else
   1.415 +            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
   1.416 +%endif
   1.417 +
   1.418 +            paddw       xmm1,       xmm4        ; + 8 rounding words loaded from vp8_rv
   1.419 +            ;paddw     xmm1,       eight8s
   1.420 +            psraw       xmm1,       4           ; (x + sum + rv) >> 4
   1.421 +
   1.422 +            packuswb    xmm1,       xmm0
   1.423 +            pand        xmm1,       xmm3        ; filtered bytes where the mask is set
   1.424 +
   1.425 +            pandn       xmm3,       xmm2        ; original bytes elsewhere
   1.426 +            por         xmm1,       xmm3
   1.427 +
   1.428 +            and         rcx,        15
   1.429 +            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]
   1.430 +
   1.431 +            mov         rcx,        rdx
   1.432 +            sub         rcx,        8
   1.433 +
   1.434 +            and         rcx,        15
   1.435 +            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]  result computed 8 iterations earlier
   1.436 +
   1.437 +            movq        [rsi],      mm0         ; delayed store: this row has just left the window
   1.438 +            lea         rsi,        [rsi+rax]
   1.439 +
   1.440 +            lea         rdi,        [rdi+rax]
   1.441 +            add         rdx,        1
   1.442 +
   1.443 +            cmp         edx,        dword arg(2) ;rows
   1.444 +            jl          .loop_row
   1.445 +
   1.446 +        mov         rcx,        8       ; rcx is dead here (reloaded at .loop_col)
   1.447 +        add         arg(0),     rcx     ; s += 8, full pointer width (a dword add drops the carry on 64-bit)
   1.448 +        sub         dword arg(3), 8 ; cols -= 8; the flags from sub feed the branch
   1.449 +        jg          .loop_col
   1.450 +
   1.451 +    add         rsp, 128+16
   1.452 +    pop         rsp
   1.453 +
   1.454 +    ; begin epilog
   1.455 +    pop rdi
   1.456 +    pop rsi
   1.457 +    RESTORE_GOT
   1.458 +    RESTORE_XMM
   1.459 +    UNSHADOW_ARGS
   1.460 +    pop         rbp
   1.461 +    ret
   1.462 +%undef flimit4
   1.463 +
   1.464 +
   1.465 +;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,     - horizontal post-filter, in place, one row at a time
   1.466 +;                                int pitch, int rows, int cols,int flimit)   - also writes 8 bytes left and right of each row
   1.467 +global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
   1.468 +sym(vp8_mbpost_proc_across_ip_xmm):
   1.469 +    push        rbp
   1.470 +    mov         rbp, rsp
   1.471 +    SHADOW_ARGS_TO_STACK 5
   1.472 +    SAVE_XMM 7
   1.473 +    GET_GOT     rbx
   1.474 +    push        rsi
   1.475 +    push        rdi
   1.476 +    ; end prolog
   1.477 +
   1.478 +    ALIGN_STACK 16, rax
   1.479 +    sub         rsp, 16
   1.480 +
   1.481 +    ; create flimit4 at [rsp]  (flimit replicated into 4 dwords)
   1.482 +    mov         eax, dword ptr arg(4) ;flimit
   1.483 +    mov         [rsp], eax
   1.484 +    mov         [rsp+4], eax
   1.485 +    mov         [rsp+8], eax
   1.486 +    mov         [rsp+12], eax
   1.487 +%define flimit4 [rsp]
   1.488 +
   1.489 +
   1.490 +    ;for(r=0;r<rows;r++)
   1.491 +.ip_row_loop:
   1.492 +
   1.493 +        xor         rdx,    rdx ;sumsq=0; (must stay untouched until .ip_var_loop)
   1.494 +        xor         rcx,    rcx ;sum=0;
   1.495 +        mov         rsi,    arg(0); s
   1.496 +
   1.497 +
   1.498 +        ; dup the first byte into the left border 8 times
   1.499 +        movq        mm1,   [rsi]
   1.500 +        punpcklbw   mm1,   mm1
   1.501 +        punpcklwd   mm1,   mm1
   1.502 +        punpckldq   mm1,   mm1
   1.503 +
   1.504 +        mov         rdi,    -8                  ; rdi = i, also the start index of .ip_var_loop
   1.505 +        movq        [rsi+rdi], mm1
   1.506 +
   1.507 +        ; dup the last byte into the right border
   1.508 +        movsxd      rax,    dword arg(3)        ; cols goes in rax (scratch here); using rdx would seed sumsq with cols
   1.509 +        movq        mm1,   [rsi + rax + -1]
   1.510 +        punpcklbw   mm1,   mm1
   1.511 +        punpcklwd   mm1,   mm1
   1.512 +        punpckldq   mm1,   mm1
   1.513 +        movq        [rsi+rax], mm1
   1.514 +
   1.515 +.ip_var_loop:
   1.516 +        ;for(i=-8;i<=6;i++)
   1.517 +        ;{
   1.518 +        ;    sumsq += s[i]*s[i];
   1.519 +        ;    sum   += s[i];
   1.520 +        ;}
   1.521 +        movzx       eax, byte [rsi+rdi]
   1.522 +        add         ecx, eax
   1.523 +        mul         al                          ; ax = s[i]*s[i]; the upper half of eax is still 0
   1.524 +        add         edx, eax
   1.525 +        add         rdi, 1
   1.526 +        cmp         rdi, 6
   1.527 +        jle         .ip_var_loop
   1.528 +
   1.529 +
   1.530 +            ;mov         rax,    sumsq
   1.531 +            ;movd        xmm7,   rax
   1.532 +            movd        xmm7,   edx             ; xmm7 lane 0 = sumsq
   1.533 +
   1.534 +            ;mov         rax,    sum
   1.535 +            ;movd        xmm6,   rax
   1.536 +            movd        xmm6,   ecx             ; xmm6 lane 0 = sum
   1.537 +
   1.538 +            mov         rsi,    arg(0) ;s
   1.539 +            xor         rcx,    rcx             ; rcx = column index c
   1.540 +
   1.541 +            movsxd      rdx,    dword arg(3) ;cols
   1.542 +            add         rdx,    8               ; run 8 columns past the end to flush the delayed stores
   1.543 +            pxor        mm0,    mm0
   1.544 +            pxor        mm1,    mm1
   1.545 +
   1.546 +            pxor        xmm0,   xmm0
   1.547 +.nextcol4:
   1.548 +
   1.549 +            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
   1.550 +            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
   1.551 +
   1.552 +            punpcklbw   xmm1,   xmm0                    ; expanding
   1.553 +            punpcklbw   xmm2,   xmm0                    ; expanding
   1.554 +
   1.555 +            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
   1.556 +            punpcklwd   xmm2,   xmm0                    ; expanding to dwords
   1.557 +
   1.558 +            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
   1.559 +            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2
   1.560 +
   1.561 +            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
   1.562 +            pmaddwd     xmm1,   xmm2                    ; (b+a)*(b-a) = b*b - a*a per lane (high word of xmm1 is 0)
   1.563 +
   1.564 +            paddd       xmm6,   xmm2                    ; lane 0: sum   for column c
   1.565 +            paddd       xmm7,   xmm1                    ; lane 0: sumsq for column c
   1.566 +
   1.567 +            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0 to all lanes
   1.568 +            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0 to all lanes
   1.569 +
   1.570 +            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
   1.571 +            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000
   1.572 +
   1.573 +            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
   1.574 +            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7
   1.575 +
   1.576 +            paddd       xmm6,   xmm4
   1.577 +            paddd       xmm7,   xmm3
   1.578 +
   1.579 +            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
   1.580 +            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6
   1.581 +
   1.582 +            paddd       xmm7,   xmm3
   1.583 +            paddd       xmm6,   xmm4
   1.584 +
   1.585 +            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   0000  10--5 squared
   1.586 +            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   0000  10--5
   1.587 +
   1.588 +            paddd       xmm7,   xmm3                    ; xmm7 = running sumsq for columns c..c+3
   1.589 +            paddd       xmm6,   xmm4                    ; xmm6 = running sum   for columns c..c+3
   1.590 +
   1.591 +            movdqa      xmm3,   xmm6
   1.592 +            pmaddwd     xmm3,   xmm3                    ; sum*sum
   1.593 +
   1.594 +            movdqa      xmm5,   xmm7
   1.595 +            pslld       xmm5,   4
   1.596 +
   1.597 +            psubd       xmm5,   xmm7                    ; sumsq*15
   1.598 +            psubd       xmm5,   xmm3                    ; sumsq*15 - sum*sum
   1.599 +
   1.600 +            psubd       xmm5,   flimit4
   1.601 +            psrad       xmm5,   31                      ; all ones where sumsq*15 - sum*sum < flimit
   1.602 +
   1.603 +            packssdw    xmm5,   xmm0
   1.604 +            packsswb    xmm5,   xmm0                    ; byte mask: 0xFF = use the filtered value
   1.605 +
   1.606 +            movd        xmm1,   DWORD PTR [rsi+rcx]
   1.607 +            movq        xmm2,   xmm1                    ; keep the unfiltered bytes
   1.608 +
   1.609 +            punpcklbw   xmm1,   xmm0
   1.610 +            punpcklwd   xmm1,   xmm0
   1.611 +
   1.612 +            paddd       xmm1,   xmm6
   1.613 +            paddd       xmm1,   [GLOBAL(four8s)]
   1.614 +
   1.615 +            psrad       xmm1,   4                       ; (x + sum + 8) >> 4
   1.616 +            packssdw    xmm1,   xmm0
   1.617 +
   1.618 +            packuswb    xmm1,   xmm0
   1.619 +            pand        xmm1,   xmm5                    ; filtered bytes where the mask is set
   1.620 +
   1.621 +            pandn       xmm5,   xmm2                    ; original bytes elsewhere
   1.622 +            por         xmm5,   xmm1
   1.623 +
   1.624 +            movd        [rsi+rcx-8],  mm0               ; delayed store: result from two iterations ago
   1.625 +            movq        mm0,    mm1
   1.626 +
   1.627 +            movdq2q     mm1,    xmm5
   1.628 +            psrldq      xmm7,   12                      ; carry lane 3 (sumsq at c+3) into lane 0
   1.629 +
   1.630 +            psrldq      xmm6,   12                      ; carry lane 3 (sum at c+3) into lane 0
   1.631 +            add         rcx,    4
   1.632 +
   1.633 +            cmp         rcx,    rdx
   1.634 +            jl          .nextcol4
   1.635 +
   1.636 +        ;s+=pitch;
   1.637 +        movsxd rax, dword arg(1)
   1.638 +        add    arg(0), rax
   1.639 +
   1.640 +        sub dword arg(2), 1 ;rows-=1
   1.641 +        cmp dword arg(2), 0
   1.642 +        jg .ip_row_loop
   1.643 +
   1.644 +    add         rsp, 16
   1.645 +    pop         rsp
   1.646 +
   1.647 +    ; begin epilog
   1.648 +    pop rdi
   1.649 +    pop rsi
   1.650 +    RESTORE_GOT
   1.651 +    RESTORE_XMM
   1.652 +    UNSHADOW_ARGS
   1.653 +    pop         rbp
   1.654 +    ret
   1.655 +%undef flimit4
   1.656 +
   1.657 +
   1.658 +;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,     - clamps each row of the plane, then adds noise bytes in place
   1.659 +;                            unsigned char blackclamp[16],
   1.660 +;                            unsigned char whiteclamp[16],
   1.661 +;                            unsigned char bothclamp[16],
   1.662 +;                            unsigned int Width, unsigned int Height, int Pitch)
   1.663 +extern sym(rand)
   1.664 +global sym(vp8_plane_add_noise_wmt) PRIVATE
   1.665 +sym(vp8_plane_add_noise_wmt):
   1.666 +    push        rbp
   1.667 +    mov         rbp, rsp
   1.668 +    SHADOW_ARGS_TO_STACK 8
   1.669 +    GET_GOT     rbx
   1.670 +    push        rsi
   1.671 +    push        rdi
   1.672 +    ; end prolog
   1.673 +
   1.674 +.addnoise_loop:
   1.675 +    call sym(rand) WRT_PLT
   1.676 +    mov     rcx, arg(1) ;noise
   1.677 +    and     rax, 0xff   ; random offset 0..255 into the noise buffer for this row
   1.678 +    add     rcx, rax
   1.679 +
   1.680 +    ; we rely on the fact that the clamping vectors are stored contiguously
   1.681 +    ; in black/white/both order. Note that we have to reload this here because
   1.682 +    ; rdx could be trashed by rand()
   1.683 +    mov     rdx, arg(2) ; blackclamp
   1.684 +
   1.685 +
   1.686 +            mov     rdi, rcx                ; rdi = noise + random offset
   1.687 +            movsxd  rcx, dword arg(5) ;[Width]
   1.688 +            mov     rsi, arg(0) ;Pos
   1.689 +            xor         rax,rax             ; rax = byte offset within the row
   1.690 +
   1.691 +.addnoise_nextset:
   1.692 +            movdqu      xmm1,[rsi+rax]         ; get the source
   1.693 +
   1.694 +            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
   1.695 +            paddusb     xmm1, [rdx+32] ;bothclamp         ; NOTE(review): these memory operands need 16-byte alignment - confirm at the caller
   1.696 +            psubusb     xmm1, [rdx+16] ;whiteclamp
   1.697 +
   1.698 +            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
   1.699 +            paddb       xmm1,xmm2              ; add it in (wrapping add; the clamp above leaves headroom)
   1.700 +            movdqu      [rsi+rax],xmm1         ; store the result
   1.701 +
   1.702 +            add         rax,16                 ; move to the next 16 pixels of the line
   1.703 +
   1.704 +            cmp         rax, rcx
   1.705 +            jl          .addnoise_nextset      ; whole 16-byte groups only: may touch up to 15 bytes past Width
   1.706 +
   1.707 +    movsxd  rax, dword arg(7) ; Pitch
   1.708 +    add     arg(0), rax ; Start += Pitch
   1.709 +    sub     dword arg(6), 1   ; Height -= 1
   1.710 +    jg      .addnoise_loop    ; loop while rows remain (uses the flags from the sub above)
   1.711 +
   1.712 +    ; begin epilog
   1.713 +    pop rdi
   1.714 +    pop rsi
   1.715 +    RESTORE_GOT
   1.716 +    UNSHADOW_ARGS
   1.717 +    pop         rbp
   1.718 +    ret
   1.719 +
   1.720 +
   1.721 +SECTION_RODATA
   1.722 +align 16
   1.723 +four8s:                 ; rounding constant added before the >>4 in vp8_mbpost_proc_across_ip_xmm
   1.724 +    times 4 dd 8        ; four dwords, each 8

mercurial