media/libvpx/vp8/common/x86/variance_impl_ssse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %define xmm_filter_shift            7
    17 ;void vp8_filter_block2d_bil_var_ssse3
    18 ;(
    19 ;    unsigned char *ref_ptr,
    20 ;    int ref_pixels_per_line,
    21 ;    unsigned char *src_ptr,
    22 ;    int src_pixels_per_line,
    23 ;    unsigned int Height,
    24 ;    int  xoffset,
    25 ;    int  yoffset,
    26 ;    int *sum,
    27 ;    unsigned int *sumsquared
    28 ;
    29 ;)
    30 ;Note: The filter coefficient at offset=0 is 128. Since the second operand
    31 ;of pmaddubsw is treated as signed bytes, the zero offset is handled separately.
    32 global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
    33 sym(vp8_filter_block2d_bil_var_ssse3):
       ; Entry point: prologue, accumulator setup and dispatch on the two
       ; sub-pixel offsets.
       ;   xmm6 = running sum of differences (packed words), cleared here
       ;   xmm7 = running sum of squared differences (packed dwords), cleared here
       ;   rcx  = base address of vp8_bilinear_filters_ssse3; every table entry
       ;          is 16 bytes, hence the "shl 4" when indexing by an offset
       ; Dispatch:
       ;   xoffset == 0                -> .filter_block2d_bil_var_ssse3_sp_only
       ;                                  (rcx must still hold the table base)
       ;   xoffset != 0, yoffset == 0  -> .filter_block2d_bil_var_ssse3_fp_only
       ;                                  (rax must hold the HFilter address)
       ;   both non-zero               -> fall through into the two-pass path
       ;                                  with rax = HFilter, rdx = VFilter
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, arg() and GLOBAL() are macros
       ; from vpx_ports/x86_abi_support.asm; their expansion is not shown here.
    34     push        rbp
    35     mov         rbp, rsp
    36     SHADOW_ARGS_TO_STACK 9
    37     SAVE_XMM 7
    38     GET_GOT     rbx
    39     push rsi
    40     push rdi
    41     ; end prolog
    43         pxor            xmm6,           xmm6
    44         pxor            xmm7,           xmm7
    46         lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
    47         movsxd          rax,            dword ptr arg(5)     ; xoffset
    49         test            rax,            rax                  ; skip first_pass filter if xoffset=0
    50         je              .filter_block2d_bil_var_ssse3_sp_only
    52         shl             rax,            4                    ; point to filter coeff with xoffset
    53         lea             rax,            [rax + rcx]          ; HFilter
    55         movsxd          rdx,            dword ptr arg(6)     ; yoffset
    57         test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
    58         je              .filter_block2d_bil_var_ssse3_fp_only
    60         shl             rdx,            4
    61         lea             rdx,            [rdx + rcx]          ; VFilter
    63         mov             rsi,            arg(0)               ;ref_ptr
               ; Two-pass path (xoffset != 0 and yoffset != 0).
               ; Expects rax -> HFilter and rdx -> VFilter from the dispatch code.
               ; rsi walks ref_ptr, rdi walks src_ptr, rcx counts rows (Height).
               ; One reference row is filtered before the loop and one per
               ; iteration, so Height + 1 reference rows are read; each row read
               ; touches bytes [rsi] .. [rsi+16].
    64         mov             rdi,            arg(2)               ;src_ptr
    65         movsxd          rcx,            dword ptr arg(4)     ;Height
               ; Prime the loop: horizontally filter the first reference row.
               ; Interleaving p[i] with p[i+1] lets pmaddubsw form
               ; p[i]*f0 + p[i+1]*f1 in each word.
    67         movdqu          xmm0,           XMMWORD PTR [rsi]
    68         movdqu          xmm1,           XMMWORD PTR [rsi+1]
    69         movdqa          xmm2,           xmm0
    71         punpcklbw       xmm0,           xmm1
    72         punpckhbw       xmm2,           xmm1
    73         pmaddubsw       xmm0,           [rax]
    74         pmaddubsw       xmm2,           [rax]
               ; Round (+xmm_bi_rd) and scale down by xmm_filter_shift.
    76         paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
    77         paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
    78         psraw           xmm0,           xmm_filter_shift
    79         psraw           xmm2,           xmm_filter_shift
               ; xmm0 = first filtered row, packed back to 16 bytes.
    81         packuswb        xmm0,           xmm2
               ; Step to the second reference row. Non-32-bit builds keep both
               ; strides in r8 / r9; 32-bit builds re-read them from the argument
               ; slots on every use.
    83 %if ABI_IS_32BIT
    84         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
    85 %else
    86         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
    87         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
    88         lea             rsi,            [rsi + r8]
    89 %endif
    91 .filter_block2d_bil_var_ssse3_loop:
               ; First pass: horizontally filter the current reference row
               ; into xmm1 (16 packed bytes).
    92         movdqu          xmm1,           XMMWORD PTR [rsi]
    93         movdqu          xmm2,           XMMWORD PTR [rsi+1]
    94         movdqa          xmm3,           xmm1
    96         punpcklbw       xmm1,           xmm2
    97         punpckhbw       xmm3,           xmm2
    98         pmaddubsw       xmm1,           [rax]
    99         pmaddubsw       xmm3,           [rax]
   101         paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
   102         paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
   103         psraw           xmm1,           xmm_filter_shift
   104         psraw           xmm3,           xmm_filter_shift
   105         packuswb        xmm1,           xmm3
               ; Second pass: xmm2/xmm3 = previous filtered row, xmm0 is
               ; updated to the current row for the next iteration. Interleave
               ; previous/current bytes and apply VFilter.
   107         movdqa          xmm2,           xmm0
   108         movdqa          xmm0,           xmm1
   109         movdqa          xmm3,           xmm2
   111         punpcklbw       xmm2,           xmm1
   112         punpckhbw       xmm3,           xmm1
   113         pmaddubsw       xmm2,           [rdx]
   114         pmaddubsw       xmm3,           [rdx]
   116         paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
   117         paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
   118         psraw           xmm2,           xmm_filter_shift
   119         psraw           xmm3,           xmm_filter_shift
               ; Load 16 source bytes as two 8-byte halves, zero-extended to
               ; words in xmm1 (low half) and xmm5 (high half).
   121         movq            xmm1,           QWORD PTR [rdi]
   122         pxor            xmm4,           xmm4
   123         punpcklbw       xmm1,           xmm4
   124         movq            xmm5,           QWORD PTR [rdi+8]
   125         punpcklbw       xmm5,           xmm4
               ; diff = filtered - src; accumulate diff into xmm6 (words) and
               ; diff*diff into xmm7 (dwords, via pmaddwd of diff with itself).
   127         psubw           xmm2,           xmm1
   128         psubw           xmm3,           xmm5
   129         paddw           xmm6,           xmm2
   130         paddw           xmm6,           xmm3
   131         pmaddwd         xmm2,           xmm2
   132         pmaddwd         xmm3,           xmm3
   133         paddd           xmm7,           xmm2
   134         paddd           xmm7,           xmm3
               ; Advance both pointers by one row.
   136 %if ABI_IS_32BIT
   137         add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
   138         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
   139 %else
   140         lea             rsi,            [rsi + r8]
   141         lea             rdi,            [rdi + r9]
   142 %endif
               ; The count is tested after the body, so the body always runs
               ; at least once.
   144         sub             rcx,            1
   145         jnz             .filter_block2d_bil_var_ssse3_loop
   147         jmp             .filter_block2d_bil_variance
   149 .filter_block2d_bil_var_ssse3_sp_only:
       ; Second-pass-only path (xoffset == 0): vertical filtering only.
       ; Expects rcx = base of vp8_bilinear_filters_ssse3 on entry, and
       ; xmm6 / xmm7 already cleared by the dispatch code.
       ; Inside the loop: rsi = ref row, rdi = src row, rax = ref stride,
       ; rdx -> VFilter, rcx = rows remaining, xmm1 = previous reference row.
       ; One row is loaded before the loop and one per iteration, so
       ; Height + 1 reference rows of 16 bytes are read.
   150         movsxd          rdx,            dword ptr arg(6)     ; yoffset
   152         test            rdx,            rdx                  ; yoffset == 0 as well: no filtering needed
   153         je              .filter_block2d_bil_var_ssse3_full_pixel
   155         shl             rdx,            4
   156         lea             rdx,            [rdx + rcx]          ; VFilter
   158         mov             rsi,            arg(0)               ;ref_ptr
   159         mov             rdi,            arg(2)               ;src_ptr
   160         movsxd          rcx,            dword ptr arg(4)     ;Height
   161         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
               ; xmm1 = first reference row. xmm0 needs no initial value here:
               ; the loop writes it before its only read.
   163         movdqu          xmm1,           XMMWORD PTR [rsi]
   166 %if ABI_IS_32BIT=0
   167         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
   168 %endif
   170         lea             rsi,            [rsi + rax]
   172 .filter_block2d_bil_sp_only_loop:
               ; xmm3 = current row, xmm1/xmm2 = previous row,
               ; xmm0 = copy of the current row kept for the next iteration.
   173         movdqu          xmm3,           XMMWORD PTR [rsi]
   174         movdqa          xmm2,           xmm1
   175         movdqa          xmm0,           xmm3
               ; Interleave previous/current bytes and apply VFilter, then
               ; round (+xmm_bi_rd) and shift right by xmm_filter_shift.
   177         punpcklbw       xmm1,           xmm3
   178         punpckhbw       xmm2,           xmm3
   179         pmaddubsw       xmm1,           [rdx]
   180         pmaddubsw       xmm2,           [rdx]
   182         paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
   183         paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
   184         psraw           xmm1,           xmm_filter_shift
   185         psraw           xmm2,           xmm_filter_shift
               ; Load 16 source bytes as two zero-extended 8-word halves.
   187         movq            xmm3,           QWORD PTR [rdi]
   188         pxor            xmm4,           xmm4
   189         punpcklbw       xmm3,           xmm4
   190         movq            xmm5,           QWORD PTR [rdi+8]
   191         punpcklbw       xmm5,           xmm4
               ; diff = filtered - src; xmm6 += diff, xmm7 += diff*diff.
   193         psubw           xmm1,           xmm3
   194         psubw           xmm2,           xmm5
   195         paddw           xmm6,           xmm1
   196         paddw           xmm6,           xmm2
   197         pmaddwd         xmm1,           xmm1
   198         pmaddwd         xmm2,           xmm2
   199         paddd           xmm7,           xmm1
   200         paddd           xmm7,           xmm2
               ; The current row becomes the previous row.
   202         movdqa          xmm1,           xmm0
   203         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
   205 %if ABI_IS_32BIT
   206         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
   207 %else
   208         lea             rdi,            [rdi + r9]
   209 %endif
   211         sub             rcx,            1
   212         jnz             .filter_block2d_bil_sp_only_loop
   214         jmp             .filter_block2d_bil_variance
   216 .filter_block2d_bil_var_ssse3_full_pixel:
       ; Full-pixel path (xoffset == 0 and yoffset == 0): no filtering, the
       ; reference bytes are compared with the source bytes directly.
       ; rsi = ref row, rdi = src row, rax = ref stride, rdx = src stride,
       ; rcx = rows remaining, xmm0 = zero (used to widen bytes to words).
       ; xmm6 / xmm7 are the accumulators cleared by the dispatch code.
   217         mov             rsi,            arg(0)               ;ref_ptr
   218         mov             rdi,            arg(2)               ;src_ptr
   219         movsxd          rcx,            dword ptr arg(4)     ;Height
   220         movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
   221         movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
   222         pxor            xmm0,           xmm0
   224 .filter_block2d_bil_full_pixel_loop:
               ; 16 reference bytes -> xmm1 (low 8) and xmm2 (high 8) as words.
   225         movq            xmm1,           QWORD PTR [rsi]
   226         punpcklbw       xmm1,           xmm0
   227         movq            xmm2,           QWORD PTR [rsi+8]
   228         punpcklbw       xmm2,           xmm0
               ; 16 source bytes -> xmm3 (low 8) and xmm4 (high 8) as words.
   230         movq            xmm3,           QWORD PTR [rdi]
   231         punpcklbw       xmm3,           xmm0
   232         movq            xmm4,           QWORD PTR [rdi+8]
   233         punpcklbw       xmm4,           xmm0
               ; diff = ref - src; xmm6 += diff, xmm7 += diff*diff.
   235         psubw           xmm1,           xmm3
   236         psubw           xmm2,           xmm4
   237         paddw           xmm6,           xmm1
   238         paddw           xmm6,           xmm2
   239         pmaddwd         xmm1,           xmm1
   240         pmaddwd         xmm2,           xmm2
   241         paddd           xmm7,           xmm1
   242         paddd           xmm7,           xmm2
   244         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
   245         lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
   246         sub             rcx,            1
   247         jnz             .filter_block2d_bil_full_pixel_loop
   249         jmp             .filter_block2d_bil_variance
   251 .filter_block2d_bil_var_ssse3_fp_only:
       ; First-pass-only path (xoffset != 0, yoffset == 0): horizontal
       ; filtering only.
       ; Expects rax -> HFilter on entry, and xmm6 / xmm7 already cleared by
       ; the dispatch code.
       ; rsi = ref row, rdi = src row, rdx = ref stride, rcx = rows remaining.
       ; Each iteration reads reference bytes [rsi] .. [rsi+16].
   252         mov             rsi,            arg(0)               ;ref_ptr
   253         mov             rdi,            arg(2)               ;src_ptr
   254         movsxd          rcx,            dword ptr arg(4)     ;Height
   255         movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
   259 %if ABI_IS_32BIT=0
   260         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
   261 %endif
   263 .filter_block2d_bil_fp_only_loop:
               ; Interleave p[i] with p[i+1] so pmaddubsw forms
               ; p[i]*f0 + p[i+1]*f1 per word; low 8 results in xmm1,
               ; high 8 results in xmm3.
   264         movdqu          xmm1,           XMMWORD PTR [rsi]
   265         movdqu          xmm2,           XMMWORD PTR [rsi+1]
   266         movdqa          xmm3,           xmm1
   268         punpcklbw       xmm1,           xmm2
   269         punpckhbw       xmm3,           xmm2
   270         pmaddubsw       xmm1,           [rax]
   271         pmaddubsw       xmm3,           [rax]
               ; Round (+xmm_bi_rd) and shift right by xmm_filter_shift.
   273         paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
   274         paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
   275         psraw           xmm1,           xmm_filter_shift
   276         psraw           xmm3,           xmm_filter_shift
               ; Load 16 source bytes as two zero-extended 8-word halves.
               ; movq is a 64-bit load, so it carries the QWORD size keyword
               ; like every other movq in this file.
   278         movq            xmm2,           QWORD PTR [rdi]
   279         pxor            xmm4,           xmm4
   280         punpcklbw       xmm2,           xmm4
   281         movq            xmm5,           QWORD PTR [rdi+8]
   282         punpcklbw       xmm5,           xmm4
               ; diff = filtered - src; xmm6 += diff, xmm7 += diff*diff.
   284         psubw           xmm1,           xmm2
   285         psubw           xmm3,           xmm5
   286         paddw           xmm6,           xmm1
   287         paddw           xmm6,           xmm3
   288         pmaddwd         xmm1,           xmm1
   289         pmaddwd         xmm3,           xmm3
   290         paddd           xmm7,           xmm1
   291         paddd           xmm7,           xmm3
   293         lea             rsi,            [rsi + rdx]
   294 %if ABI_IS_32BIT
   295         add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
   296 %else
   297         lea             rdi,            [rdi + r9]
   298 %endif
   300         sub             rcx,            1
   301         jnz             .filter_block2d_bil_fp_only_loop
   303         jmp             .filter_block2d_bil_variance
   305 .filter_block2d_bil_variance:
       ; Common tail for all four paths: reduce the accumulators and store.
       ;   in : xmm6 = 8 signed word partial sums of differences
       ;        xmm7 = 4 dword partial sums of squared differences
       ;   out: 32-bit store to *sum (arg 7) and to *sumsquared (arg 8)
   306         pxor        xmm0,           xmm0
   307         pxor        xmm1,           xmm1
   308         pxor        xmm5,           xmm5
               ; Widen the word sums to dwords: place each word in the upper
               ; half of a dword, then arithmetic-shift right by 16 so the
               ; sign is preserved. xmm0 = low four, xmm1 = high four.
   310         punpcklwd   xmm0,           xmm6
   311         punpckhwd   xmm1,           xmm6
   312         psrad       xmm0,           16
   313         psrad       xmm1,           16
   314         paddd       xmm0,           xmm1
   315         movdqa      xmm1,           xmm0
               ; Squared-difference sums: spread the four dwords over two
               ; registers (interleaved with zeros from xmm5) and add.
   317         movdqa      xmm6,           xmm7
   318         punpckldq   xmm6,           xmm5
   319         punpckhdq   xmm7,           xmm5
   320         paddd       xmm6,           xmm7
               ; Same folding step for the difference sums.
   322         punpckldq   xmm0,           xmm5
   323         punpckhdq   xmm1,           xmm5
   324         paddd       xmm0,           xmm1
               ; Fold the upper 8 bytes onto the lower 8; the low dword of
               ; xmm0 / xmm6 then holds the complete total.
   326         movdqa      xmm7,           xmm6
   327         movdqa      xmm1,           xmm0
   329         psrldq      xmm7,           8
   330         psrldq      xmm1,           8
   332         paddd       xmm6,           xmm7
   333         paddd       xmm0,           xmm1
   335         mov         rsi,            arg(7) ;[Sum]
   336         mov         rdi,            arg(8) ;[SSE]
               ; movd stores only the low 32 bits of each register.
   338         movd        [rsi],       xmm0
   339         movd        [rdi],       xmm6
   341     ; begin epilog
           ; Undo the prologue in reverse order. RESTORE_GOT, RESTORE_XMM and
           ; UNSHADOW_ARGS are macros from vpx_ports/x86_abi_support.asm
           ; (expansion not shown here).
   342     pop rdi
   343     pop rsi
   344     RESTORE_GOT
   345     RESTORE_XMM
   346     UNSHADOW_ARGS
   347     pop         rbp
   348     ret
   351 SECTION_RODATA
       ; Rounding constant added before the shift by xmm_filter_shift:
       ; eight words of 64.
   352 align 16
   353 xmm_bi_rd:
   354     times 8 dw 64
       ; Bilinear filter table, indexed by offset 0..7. Each entry is one byte
       ; pair (first tap, second tap) repeated 8 times = 16 bytes, matching the
       ; "shl 4" used to index it. The taps of every pair sum to 128.
       ; Entry 0 (128, 0) is never fed to pmaddubsw by the code in this file:
       ; a zero offset branches to a path that skips that filter pass.
   355 align 16
   356 vp8_bilinear_filters_ssse3:
   357     times 8 db 128, 0
   358     times 8 db 112, 16
   359     times 8 db 96,  32
   360     times 8 db 80,  48
   361     times 8 db 64,  64
   362     times 8 db 48,  80
   363     times 8 db 32,  96
   364     times 8 db 16,  112

mercurial