media/libvpx/vp8/common/x86/mfqe_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 15 Jan 2015 15:59:08 +0100
branch:      TOR_BUG_9701
changeset:   10:ac0c01689b40
permissions: -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp8_filter_by_weight16x16_sse2
    15 ;(
    16 ;    unsigned char *src,
    17 ;    int            src_stride,
    18 ;    unsigned char *dst,
    19 ;    int            dst_stride,
    20 ;    int            src_weight
    21 ;)
    22 global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
    23 sym(vp8_filter_by_weight16x16_sse2):
    24     push        rbp
    25     mov         rbp, rsp
    26     SHADOW_ARGS_TO_STACK 5
    27     SAVE_XMM 6
    28     GET_GOT     rbx
    29     push        rsi
    30     push        rdi
    31     ; end prolog
    33     movd        xmm0, arg(4)                ; src_weight
    34     pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    35     punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
    37     movdqa      xmm1, [GLOBAL(tMFQE)]
    38     psubw       xmm1, xmm0                  ; dst_weight
    40     mov         rax, arg(0)                 ; src
    41     mov         rsi, arg(1)                 ; src_stride
    42     mov         rdx, arg(2)                 ; dst
    43     mov         rdi, arg(3)                 ; dst_stride
    45     mov         rcx, 16                     ; loop count
    46     pxor        xmm6, xmm6
    48 .combine
    49     movdqa      xmm2, [rax]
    50     movdqa      xmm4, [rdx]
    51     add         rax, rsi
    53     ; src * src_weight
    54     movdqa      xmm3, xmm2
    55     punpcklbw   xmm2, xmm6
    56     punpckhbw   xmm3, xmm6
    57     pmullw      xmm2, xmm0
    58     pmullw      xmm3, xmm0
    60     ; dst * dst_weight
    61     movdqa      xmm5, xmm4
    62     punpcklbw   xmm4, xmm6
    63     punpckhbw   xmm5, xmm6
    64     pmullw      xmm4, xmm1
    65     pmullw      xmm5, xmm1
    67     ; sum, round and shift
    68     paddw       xmm2, xmm4
    69     paddw       xmm3, xmm5
    70     paddw       xmm2, [GLOBAL(tMFQE_round)]
    71     paddw       xmm3, [GLOBAL(tMFQE_round)]
    72     psrlw       xmm2, 4
    73     psrlw       xmm3, 4
    75     packuswb    xmm2, xmm3
    76     movdqa      [rdx], xmm2
    77     add         rdx, rdi
    79     dec         rcx
    80     jnz         .combine
    82     ; begin epilog
    83     pop         rdi
    84     pop         rsi
    85     RESTORE_GOT
    86     RESTORE_XMM
    87     UNSHADOW_ARGS
    88     pop         rbp
    90     ret
    92 ;void vp8_filter_by_weight8x8_sse2
    93 ;(
    94 ;    unsigned char *src,
    95 ;    int            src_stride,
    96 ;    unsigned char *dst,
    97 ;    int            dst_stride,
    98 ;    int            src_weight
    99 ;)
   100 global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
   101 sym(vp8_filter_by_weight8x8_sse2):
   102     push        rbp
   103     mov         rbp, rsp
   104     SHADOW_ARGS_TO_STACK 5
   105     GET_GOT     rbx
   106     push        rsi
   107     push        rdi
   108     ; end prolog
   110     movd        xmm0, arg(4)                ; src_weight
   111     pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
   112     punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
   114     movdqa      xmm1, [GLOBAL(tMFQE)]
   115     psubw       xmm1, xmm0                  ; dst_weight
   117     mov         rax, arg(0)                 ; src
   118     mov         rsi, arg(1)                 ; src_stride
   119     mov         rdx, arg(2)                 ; dst
   120     mov         rdi, arg(3)                 ; dst_stride
   122     mov         rcx, 8                      ; loop count
   123     pxor        xmm4, xmm4
   125 .combine
   126     movq        xmm2, [rax]
   127     movq        xmm3, [rdx]
   128     add         rax, rsi
   130     ; src * src_weight
   131     punpcklbw   xmm2, xmm4
   132     pmullw      xmm2, xmm0
   134     ; dst * dst_weight
   135     punpcklbw   xmm3, xmm4
   136     pmullw      xmm3, xmm1
   138     ; sum, round and shift
   139     paddw       xmm2, xmm3
   140     paddw       xmm2, [GLOBAL(tMFQE_round)]
   141     psrlw       xmm2, 4
   143     packuswb    xmm2, xmm4
   144     movq        [rdx], xmm2
   145     add         rdx, rdi
   147     dec         rcx
   148     jnz         .combine
   150     ; begin epilog
   151     pop         rdi
   152     pop         rsi
   153     RESTORE_GOT
   154     UNSHADOW_ARGS
   155     pop         rbp
   157     ret
   159 ;void vp8_variance_and_sad_16x16_sse2 | arg
   160 ;(
   161 ;    unsigned char *src1,          0
   162 ;    int            stride1,       1
   163 ;    unsigned char *src2,          2
   164 ;    int            stride2,       3
   165 ;    unsigned int  *variance,      4
   166 ;    unsigned int  *sad,           5
   167 ;)
   168 global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
   169 sym(vp8_variance_and_sad_16x16_sse2):
   170     push        rbp
   171     mov         rbp, rsp
   172     SHADOW_ARGS_TO_STACK 6
   173     GET_GOT     rbx
   174     push        rsi
   175     push        rdi
   176     ; end prolog
   178     mov         rax,        arg(0)          ; src1
   179     mov         rcx,        arg(1)          ; stride1
   180     mov         rdx,        arg(2)          ; src2
   181     mov         rdi,        arg(3)          ; stride2
   183     mov         rsi,        16              ; block height
   185     ; Prep accumulator registers
   186     pxor        xmm3, xmm3                  ; SAD
   187     pxor        xmm4, xmm4                  ; sum of src2
   188     pxor        xmm5, xmm5                  ; sum of src2^2
   190     ; Because we're working with the actual output frames
   191     ; we can't depend on any kind of data alignment.
   192 .accumulate
   193     movdqa      xmm0, [rax]                 ; src1
   194     movdqa      xmm1, [rdx]                 ; src2
   195     add         rax, rcx                    ; src1 + stride1
   196     add         rdx, rdi                    ; src2 + stride2
   198     ; SAD(src1, src2)
   199     psadbw      xmm0, xmm1
   200     paddusw     xmm3, xmm0
   202     ; SUM(src2)
   203     pxor        xmm2, xmm2
   204     psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
   205     paddusw     xmm4, xmm2
   207     ; pmaddubsw would be ideal if it took two unsigned values. instead,
   208     ; it expects a signed and an unsigned value. so instead we zero extend
   209     ; and operate on words.
   210     pxor        xmm2, xmm2
   211     movdqa      xmm0, xmm1
   212     punpcklbw   xmm0, xmm2
   213     punpckhbw   xmm1, xmm2
   214     pmaddwd     xmm0, xmm0
   215     pmaddwd     xmm1, xmm1
   216     paddd       xmm5, xmm0
   217     paddd       xmm5, xmm1
   219     sub         rsi,        1
   220     jnz         .accumulate
   222     ; phaddd only operates on adjacent double words.
   223     ; Finalize SAD and store
   224     movdqa      xmm0, xmm3
   225     psrldq      xmm0, 8
   226     paddusw     xmm0, xmm3
   227     paddd       xmm0, [GLOBAL(t128)]
   228     psrld       xmm0, 8
   230     mov         rax,  arg(5)
   231     movd        [rax], xmm0
   233     ; Accumulate sum of src2
   234     movdqa      xmm0, xmm4
   235     psrldq      xmm0, 8
   236     paddusw     xmm0, xmm4
   237     ; Square src2. Ignore high value
   238     pmuludq     xmm0, xmm0
   239     psrld       xmm0, 8
   241     ; phaddw could be used to sum adjacent values but we want
   242     ; all the values summed. promote to doubles, accumulate,
   243     ; shift and sum
   244     pxor        xmm2, xmm2
   245     movdqa      xmm1, xmm5
   246     punpckldq   xmm1, xmm2
   247     punpckhdq   xmm5, xmm2
   248     paddd       xmm1, xmm5
   249     movdqa      xmm2, xmm1
   250     psrldq      xmm1, 8
   251     paddd       xmm1, xmm2
   253     psubd       xmm1, xmm0
   255     ; (variance + 128) >> 8
   256     paddd       xmm1, [GLOBAL(t128)]
   257     psrld       xmm1, 8
   258     mov         rax,  arg(4)
   260     movd        [rax], xmm1
   263     ; begin epilog
   264     pop         rdi
   265     pop         rsi
   266     RESTORE_GOT
   267     UNSHADOW_ARGS
   268     pop         rbp
   269     ret
SECTION_RODATA
align 16
; 128-bit constant 128: rounding term added before the >> 8 scaling in
; vp8_variance_and_sad_16x16_sse2. NASM lacks ddq on some versions, so
; the value is spelled per assembler/endianness.
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
; Total blend weight: src_weight + dst_weight == 1 << MFQE_PRECISION.
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
; Rounding term added before the >> MFQE_PRECISION shift in the blend.
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08

mercurial