media/libvpx/vp8/common/x86/sad_ssse3.asm

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 15 Jan 2015 15:59:08 +0100
branch:      TOR_BUG_9701
changeset:   10:ac0c01689b40
permissions: -rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
this solves Tor bug #9701, complying with the disk avoidance requirements
documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
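
; The macro above computes, for two 16-pixel rows, the sums of absolute
; differences between the source block and the reference block at byte
; offsets 0, +1 and +2, accumulating into xmm5/xmm6/xmm7 (the first
; invocation, %1 = 1, initializes them; later ones add). A minimal
; scalar C sketch of one invocation follows; it is an illustration
; only, and the names sad16_row/process_16x2x3 are hypothetical, not
; libvpx API.
;
;   /* Hypothetical scalar model of one PROCESS_16X2X3 invocation. */
;   static unsigned sad16_row(const unsigned char *src,
;                             const unsigned char *ref)
;   {
;       unsigned sum = 0;
;       for (int i = 0; i < 16; i++)              /* one psadbw      */
;           sum += src[i] > ref[i] ? src[i] - ref[i]
;                                  : ref[i] - src[i];
;       return sum;
;   }
;
;   static void process_16x2x3(const unsigned char *src, int src_stride,
;                              const unsigned char *ref, int ref_stride,
;                              unsigned acc[3])   /* ~ xmm5..xmm7,   */
;   {                                             /* caller-zeroed   */
;       for (int row = 0; row < 2; row++)
;           for (int off = 0; off < 3; off++)
;               acc[off] += sad16_row(src + row * src_stride,
;                                     ref + row * ref_stride + off);
;   }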
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
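
; This variant avoids the unaligned lddqu loads: the dispatcher first
; rounds rdi down to a 16-byte boundary (the "sub rdi, %1" in the
; macros below), then two aligned movdqa loads plus palignr reconstruct
; the three candidate reference rows. A C sketch of the palignr step
; follows; the function name palignr16 is hypothetical, for
; illustration only.
;
;   #include <string.h>
;
;   /* Model of "palignr dst, lo, n" with dst preloaded from hi:
;      take 16 bytes starting at byte offset n of the 32-byte
;      concatenation lo||hi. */
;   static void palignr16(unsigned char dst[16],
;                         const unsigned char lo[16],
;                         const unsigned char hi[16], int n)
;   {
;       unsigned char cat[32];
;       memcpy(cat, lo, 16);        /* xmm4: aligned chunk at rdi    */
;       memcpy(cat + 16, hi, 16);   /* next aligned chunk at rdi+16  */
;       memcpy(dst, cat + n, 16);
;   }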
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro
;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x16x3_ssse3) PRIVATE
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

        jmp .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:

        call .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx
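
; Dispatch note: rdx holds the low four bits of ref_ptr, i.e. its
; offset from a 16-byte boundary, and selects one of sixteen
; alignment-specialized bodies. The table stores 32-bit offsets
; relative to the do_jump label, and the call/pop pair materializes
; do_jump's runtime address without an absolute relocation, keeping
; the code position independent. A hypothetical C analogy (not
; libvpx API):
;
;   #include <stdint.h>
;
;   typedef void (*sad_body_fn)(void);
;   extern sad_body_fn aligned_by[16]; /* ~ the dd table, made absolute */
;
;   static void dispatch(const unsigned char *ref_ptr)
;   {
;       aligned_by[(uintptr_t)ref_ptr & 0xf]();    /* ~ jmp rcx */
;   }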
        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3

.vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
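
; A hypothetical caller sketch (wrapper name, buffer names and strides
; are assumptions, not libvpx code): results[0..2] receive the SADs of
; the 16x16 source block against ref, ref+1 and ref+2.
;
;   extern void vp8_sad16x16x3_ssse3(unsigned char *src_ptr,
;                                    int src_stride,
;                                    unsigned char *ref_ptr,
;                                    int ref_stride,
;                                    int *results);
;
;   static void check_three_positions(unsigned char *src, int src_stride,
;                                     unsigned char *ref, int ref_stride)
;   {
;       int sad[3];
;       vp8_sad16x16x3_ssse3(src, src_stride, ref, ref_stride, sad);
;       /* sad[0] = SAD(src, ref), sad[1] = SAD(src, ref+1), ... */
;   }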
;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x8x3_ssse3) PRIVATE
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

        jmp .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:

        call .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3

.vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
