media/libvpx/vp8/common/x86/recon_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701, complying with the disk-avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;void vp8_copy_mem16x16_sse2(
    15 ;    unsigned char *src,
    16 ;    int src_stride,
    17 ;    unsigned char *dst,
    18 ;    int dst_stride
    19 ;    )
       ;
       ; Copies a 16x16 byte block, one 16-byte row per XMM register. The copy is
       ; fully unrolled: rows 0-14 are moved in groups of three, row 15 on its own,
       ; with loads, stores and pointer updates interleaved.
       ; Source rows are read with movdqu, so src may be unaligned; destination
       ; rows are written with movdqa, so dst and dst_stride must keep every
       ; destination row 16-byte aligned.
       ; Register roles: rsi = src cursor, rax = src_stride (sign-extended),
       ;                 rdi = dst cursor, rcx = dst_stride (sign-extended).
    20 global sym(vp8_copy_mem16x16_sse2) PRIVATE
    21 sym(vp8_copy_mem16x16_sse2):
    22     push        rbp
    23     mov         rbp, rsp
    24     SHADOW_ARGS_TO_STACK 4
    25     push        rsi
    26     push        rdi
    27     ; end prolog
    29         mov         rsi,        arg(0) ;src;
    30         movdqu      xmm0,       [rsi]  ; src row 0
    32         movsxd      rax,        dword ptr arg(1) ;src_stride;
    33         mov         rdi,        arg(2) ;dst;
    35         movdqu      xmm1,       [rsi+rax]  ; src row 1
    36         movdqu      xmm2,       [rsi+rax*2]  ; src row 2
    38         movsxd      rcx,        dword ptr arg(3) ;dst_stride
    39         lea         rsi,        [rsi+rax*2]
    41         movdqa      [rdi],      xmm0
    42         add         rsi,        rax  ; rsi -> src row 3
    44         movdqa      [rdi+rcx],  xmm1
    45         movdqa      [rdi+rcx*2],xmm2
    47         lea         rdi,        [rdi+rcx*2]
    48         movdqu      xmm3,       [rsi]
    50         add         rdi,        rcx  ; rdi -> dst row 3
    51         movdqu      xmm4,       [rsi+rax]
    53         movdqu      xmm5,       [rsi+rax*2]
    54         lea         rsi,        [rsi+rax*2]
    56         movdqa      [rdi],  xmm3
    57         add         rsi,        rax  ; rsi -> src row 6
    59         movdqa      [rdi+rcx],  xmm4
    60         movdqa      [rdi+rcx*2],xmm5
    62         lea         rdi,        [rdi+rcx*2]
    63         movdqu      xmm0,       [rsi]
    65         add         rdi,        rcx  ; rdi -> dst row 6
    66         movdqu      xmm1,       [rsi+rax]
    68         movdqu      xmm2,       [rsi+rax*2]
    69         lea         rsi,        [rsi+rax*2]
    71         movdqa      [rdi],      xmm0
    72         add         rsi,        rax  ; rsi -> src row 9
    74         movdqa      [rdi+rcx],  xmm1
    76         movdqa      [rdi+rcx*2],    xmm2
    77         movdqu      xmm3,       [rsi]
    79         movdqu      xmm4,       [rsi+rax]
    80         lea         rdi,        [rdi+rcx*2]
    82         add         rdi,        rcx  ; rdi -> dst row 9
    83         movdqu      xmm5,       [rsi+rax*2]
    85         lea         rsi,        [rsi+rax*2]
    86         movdqa      [rdi],  xmm3
    88         add         rsi,        rax  ; rsi -> src row 12
    89         movdqa      [rdi+rcx],  xmm4
    91         movdqa      [rdi+rcx*2],xmm5
    92         movdqu      xmm0,       [rsi]
    94         lea         rdi,        [rdi+rcx*2]
    95         movdqu      xmm1,       [rsi+rax]
    97         add         rdi,        rcx  ; rdi -> dst row 12
    98         movdqu      xmm2,       [rsi+rax*2]
   100         lea         rsi,        [rsi+rax*2]  ; rsi -> src row 14
   101         movdqa      [rdi],      xmm0
   103         movdqa      [rdi+rcx],  xmm1
   104         movdqa      [rdi+rcx*2],xmm2
   106         movdqu      xmm3,       [rsi+rax]  ; src row 15 (last row)
   107         lea         rdi,        [rdi+rcx*2]  ; rdi -> dst row 14
   109         movdqa      [rdi+rcx],  xmm3  ; dst row 15
   111     ; begin epilog
   112     pop rdi
   113     pop rsi
   114     UNSHADOW_ARGS
   115     pop         rbp
   116     ret
   119 ;void vp8_intra_pred_uv_dc_mmx2(
   120 ;    unsigned char *dst,
   121 ;    int dst_stride,
   122 ;    unsigned char *above,
   123 ;    unsigned char *left,
   124 ;    int left_stride
   125 ;    )
       ;
       ; DC prediction for an 8x8 block using both edges: every byte of the
       ; 8 rows x 8 bytes written at dst is set to
       ;   (sum of 8 bytes at above + sum of 8 bytes down the left column + 8) >> 4
       ; where the left column is read one byte per row at stride left_stride.
       ; Register roles while summing: rdi = above, then 3*left_stride;
       ; rsi = left cursor; rax = left_stride; ecx = running left sum; edx = scratch.
   126 global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
   127 sym(vp8_intra_pred_uv_dc_mmx2):
   128     push        rbp
   129     mov         rbp, rsp
   130     SHADOW_ARGS_TO_STACK 5
   131     push        rsi
   132     push        rdi
   133     ; end prolog
   135     ; from top
   136     mov         rdi,        arg(2) ;above;
   137     mov         rsi,        arg(3) ;left;
   138     movsxd      rax,        dword ptr arg(4) ;left_stride;
   139     pxor        mm0,        mm0
   140     movq        mm1,        [rdi]
   141     lea         rdi,        [rax*3]  ; rdi = 3*left_stride (above pointer no longer needed)
   142     psadbw      mm1,        mm0  ; sum of absolute differences against zero = sum of the 8 above bytes
   143     ; from left
   144     movzx       ecx,        byte [rsi]
   145     movzx       edx,        byte [rsi+rax*1]
   146     add         ecx,        edx
   147     movzx       edx,        byte [rsi+rax*2]
   148     add         ecx,        edx
   150     movzx       edx,        byte [rsi+rdi]
   151     lea         rsi,        [rsi+rax*4]  ; advance to left rows 4-7
   152     add         ecx,        edx
   153     movzx       edx,        byte [rsi]
   154     add         ecx,        edx
   155     movzx       edx,        byte [rsi+rax]
   156     add         ecx,        edx
   157     movzx       edx,        byte [rsi+rax*2]
   158     add         ecx,        edx
   159     movzx       edx,        byte [rsi+rdi]
   160     add         ecx,        edx
   162     ; add up
   163     pextrw      edx,        mm1, 0x0  ; edx = top sum
   164     lea         edx,        [edx+ecx+8]  ; top + left + rounding term
   165     sar         edx,        4  ; divide by 16 samples
   166     movd        mm1,        edx
   167     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   168     pshufw      mm1,        mm1, 0x0  ; broadcast the DC word to all four words
   169     mov         rdi,        arg(0) ;dst;
   170     packuswb    mm1,        mm1  ; words -> 8 identical bytes
   172     ; write out
   173     lea         rax,        [rcx*3]  ; rax = 3*dst_stride
   174     lea         rdx,        [rdi+rcx*4]  ; rdx -> dst row 4
   176     movq [rdi      ],       mm1
   177     movq [rdi+rcx  ],       mm1
   178     movq [rdi+rcx*2],       mm1
   179     movq [rdi+rax  ],       mm1
   180     movq [rdx      ],       mm1
   181     movq [rdx+rcx  ],       mm1
   182     movq [rdx+rcx*2],       mm1
   183     movq [rdx+rax  ],       mm1
   185     ; begin epilog
   186     pop         rdi
   187     pop         rsi
   188     UNSHADOW_ARGS
   189     pop         rbp
   190     ret
   192 ;void vp8_intra_pred_uv_dctop_mmx2(
   193 ;    unsigned char *dst,
   194 ;    int dst_stride,
   195 ;    unsigned char *above,
   196 ;    unsigned char *left,
   197 ;    int left_stride
   198 ;    )
       ;
       ; DC prediction for an 8x8 block using only the top edge: every byte of
       ; the 8 rows x 8 bytes written at dst is set to
       ;   (sum of 8 bytes at above + 4) >> 3
       ; left and left_stride are accepted for signature compatibility but not read.
       ; GET_GOT/RESTORE_GOT (from x86_abi_support.asm) bracket the GLOBAL(dc_4)
       ; reference -- presumably for position-independent builds; confirm there.
   199 global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
   200 sym(vp8_intra_pred_uv_dctop_mmx2):
   201     push        rbp
   202     mov         rbp, rsp
   203     SHADOW_ARGS_TO_STACK 5
   204     GET_GOT     rbx
   205     push        rsi
   206     push        rdi
   207     ; end prolog
   209     ;arg(3), arg(4) not used
   211     ; from top
   212     mov         rsi,        arg(2) ;above;
   213     pxor        mm0,        mm0
   214     movq        mm1,        [rsi]
   215     psadbw      mm1,        mm0  ; sum of the 8 above bytes
   217     ; add up
   218     paddw       mm1,        [GLOBAL(dc_4)]  ; rounding term
   219     psraw       mm1,        3  ; divide by 8 samples
   220     pshufw      mm1,        mm1, 0x0  ; broadcast the DC word
   221     packuswb    mm1,        mm1  ; words -> 8 identical bytes
   223     ; write out
   224     mov         rdi,        arg(0) ;dst;
   225     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   226     lea         rax,        [rcx*3]  ; rax = 3*dst_stride
   228     movq [rdi      ],       mm1
   229     movq [rdi+rcx  ],       mm1
   230     movq [rdi+rcx*2],       mm1
   231     movq [rdi+rax  ],       mm1
   232     lea         rdi,        [rdi+rcx*4]  ; rdi -> dst row 4
   233     movq [rdi      ],       mm1
   234     movq [rdi+rcx  ],       mm1
   235     movq [rdi+rcx*2],       mm1
   236     movq [rdi+rax  ],       mm1
   238     ; begin epilog
   239     pop         rdi
   240     pop         rsi
   241     RESTORE_GOT
   242     UNSHADOW_ARGS
   243     pop         rbp
   244     ret
   246 ;void vp8_intra_pred_uv_dcleft_mmx2(
   247 ;    unsigned char *dst,
   248 ;    int dst_stride,
   249 ;    unsigned char *above,
   250 ;    unsigned char *left,
   251 ;    int left_stride
   252 ;    )
       ;
       ; DC prediction for an 8x8 block using only the left edge: every byte of
       ; the 8 rows x 8 bytes written at dst is set to
       ;   (sum of 8 bytes down the left column + 4) >> 3
       ; where the column is read one byte per row at stride left_stride.
       ; above is accepted for signature compatibility but not read.
   253 global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
   254 sym(vp8_intra_pred_uv_dcleft_mmx2):
   255     push        rbp
   256     mov         rbp, rsp
   257     SHADOW_ARGS_TO_STACK 5
   258     push        rsi
   259     push        rdi
   260     ; end prolog
   262     ;arg(2) not used
   264     ; from left
   265     mov         rsi,        arg(3) ;left;
   266     movsxd      rax,        dword ptr arg(4) ;left_stride;
   267     lea         rdi,        [rax*3]  ; rdi = 3*left_stride
   268     movzx       ecx,        byte [rsi]
   269     movzx       edx,        byte [rsi+rax]
   270     add         ecx,        edx
   271     movzx       edx,        byte [rsi+rax*2]
   272     add         ecx,        edx
   273     movzx       edx,        byte [rsi+rdi]
   274     add         ecx,        edx
   275     lea         rsi,        [rsi+rax*4]  ; advance to left rows 4-7
   276     movzx       edx,        byte [rsi]
   277     add         ecx,        edx
   278     movzx       edx,        byte [rsi+rax]
   279     add         ecx,        edx
   280     movzx       edx,        byte [rsi+rax*2]
   281     add         ecx,        edx
   282     movzx       edx,        byte [rsi+rdi]
   283     lea         edx,        [ecx+edx+4]  ; fold in the last sample and the rounding term
   285     ; add up
   286     shr         edx,        3  ; divide by 8 samples
   287     movd        mm1,        edx
   288     pshufw      mm1,        mm1, 0x0  ; broadcast the DC word
   289     packuswb    mm1,        mm1  ; words -> 8 identical bytes
   291     ; write out
   292     mov         rdi,        arg(0) ;dst;
   293     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   294     lea         rax,        [rcx*3]  ; rax = 3*dst_stride
   296     movq [rdi      ],       mm1
   297     movq [rdi+rcx  ],       mm1
   298     movq [rdi+rcx*2],       mm1
   299     movq [rdi+rax  ],       mm1
   300     lea         rdi,        [rdi+rcx*4]  ; rdi -> dst row 4
   301     movq [rdi      ],       mm1
   302     movq [rdi+rcx  ],       mm1
   303     movq [rdi+rcx*2],       mm1
   304     movq [rdi+rax  ],       mm1
   306     ; begin epilog
   307     pop         rdi
   308     pop         rsi
   309     UNSHADOW_ARGS
   310     pop         rbp
   311     ret
   313 ;void vp8_intra_pred_uv_dc128_mmx(
   314 ;    unsigned char *dst,
   315 ;    int dst_stride,
   316 ;    unsigned char *above,
   317 ;    unsigned char *left,
   318 ;    int left_stride
   319 ;    )
       ;
       ; DC prediction with no edge data: fills 8 rows x 8 bytes at dst with the
       ; constant byte 128 (low 8 bytes of dc_128). above, left and left_stride
       ; are accepted for signature compatibility but not read.
       ; Uses only rax/rcx/rdx as scratch, so no rsi/rdi save is needed.
   320 global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
   321 sym(vp8_intra_pred_uv_dc128_mmx):
   322     push        rbp
   323     mov         rbp, rsp
   324     SHADOW_ARGS_TO_STACK 5
   325     GET_GOT     rbx
   326     ; end prolog
   328     ;arg(2), arg(3), arg(4) not used
   330     ; write out
   331     movq        mm1,        [GLOBAL(dc_128)]
   332     mov         rax,        arg(0) ;dst;
   333     movsxd      rdx,        dword ptr arg(1) ;dst_stride
   334     lea         rcx,        [rdx*3]  ; rcx = 3*dst_stride
   336     movq [rax      ],       mm1
   337     movq [rax+rdx  ],       mm1
   338     movq [rax+rdx*2],       mm1
   339     movq [rax+rcx  ],       mm1
   340     lea         rax,        [rax+rdx*4]  ; rax -> dst row 4
   341     movq [rax      ],       mm1
   342     movq [rax+rdx  ],       mm1
   343     movq [rax+rdx*2],       mm1
   344     movq [rax+rcx  ],       mm1
   346     ; begin epilog
   347     RESTORE_GOT
   348     UNSHADOW_ARGS
   349     pop         rbp
   350     ret
   352 ;void vp8_intra_pred_uv_tm_sse2(
   353 ;    unsigned char *dst,
   354 ;    int dst_stride,
   355 ;    unsigned char *above,
   356 ;    unsigned char *left,
   357 ;    int left_stride
   358 ;    )
       ;
       ; Macro generating vp8_intra_pred_uv_tm_<variant>; instantiated below for
       ; sse2 and ssse3 (same C signature for both).
       ; For an 8x8 block, each output byte is
       ;   saturate_u8(left[row] + above[col] - above[-1])
       ; with left read one byte per row at stride left_stride. The saturation
       ; comes from packuswb.
       ; xmm1 holds (above[0..7] - above[-1]) as eight words for the whole loop.
       ; The loop runs 4 times and produces two rows per iteration.
       ; Broadcasting a byte to eight zero-extended words:
       ;   sse2  : punpcklbw with zero, pshuflw 0, punpcklqdq
       ;   ssse3 : pshufb with dc_1024 (each word selects source bytes 0 and 4;
       ;           byte 4 is zero after a movd load)
       ; Note: movd reads 4 bytes at above-1 and at each left row; only the
       ; first byte of each load is used.
   359 %macro vp8_intra_pred_uv_tm 1
   360 global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
   361 sym(vp8_intra_pred_uv_tm_%1):
   362     push        rbp
   363     mov         rbp, rsp
   364     SHADOW_ARGS_TO_STACK 5
   365     GET_GOT     rbx
   366     push        rsi
   367     push        rdi
   368     ; end prolog
   370     ; read top row
   371     mov         edx,        4  ; loop counter: 4 iterations x 2 rows
   372     mov         rsi,        arg(2) ;above
   373     movsxd      rax,        dword ptr arg(4) ;left_stride;
   374     pxor        xmm0,       xmm0
   375 %ifidn %1, ssse3
   376     movdqa      xmm2,       [GLOBAL(dc_1024)]
   377 %endif
   378     movq        xmm1,       [rsi]
   379     punpcklbw   xmm1,       xmm0  ; above[0..7] widened to words
   381     ; set up left ptrs and subtract topleft
   382     movd        xmm3,       [rsi-1]  ; byte 0 = above[-1] (top-left)
   383     mov         rsi,        arg(3) ;left;
   384 %ifidn %1, sse2
   385     punpcklbw   xmm3,       xmm0
   386     pshuflw     xmm3,       xmm3, 0x0
   387     punpcklqdq  xmm3,       xmm3
   388 %else
   389     pshufb      xmm3,       xmm2
   390 %endif
   391     psubw       xmm1,       xmm3  ; xmm1 = above[col] - topleft
   393     ; set up dest ptrs
   394     mov         rdi,        arg(0) ;dst;
   395     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   397 .vp8_intra_pred_uv_tm_%1_loop:
   398     movd        xmm3,       [rsi]  ; left sample for the even row
   399     movd        xmm5,       [rsi+rax]  ; left sample for the odd row
   400 %ifidn %1, sse2
   401     punpcklbw   xmm3,       xmm0
   402     punpcklbw   xmm5,       xmm0
   403     pshuflw     xmm3,       xmm3, 0x0
   404     pshuflw     xmm5,       xmm5, 0x0
   405     punpcklqdq  xmm3,       xmm3
   406     punpcklqdq  xmm5,       xmm5
   407 %else
   408     pshufb      xmm3,       xmm2
   409     pshufb      xmm5,       xmm2
   410 %endif
   411     paddw       xmm3,       xmm1
   412     paddw       xmm5,       xmm1
   413     packuswb    xmm3,       xmm5  ; low 8 bytes = even row, high 8 bytes = odd row
   414     movq  [rdi    ],        xmm3
   415     movhps[rdi+rcx],        xmm3
   416     lea         rsi,        [rsi+rax*2]
   417     lea         rdi,        [rdi+rcx*2]
   418     dec         edx
   419     jnz .vp8_intra_pred_uv_tm_%1_loop
   421     ; begin epilog
   422     pop         rdi
   423     pop         rsi
   424     RESTORE_GOT
   425     UNSHADOW_ARGS
   426     pop         rbp
   427     ret
   428 %endmacro
   430 vp8_intra_pred_uv_tm sse2
   431 vp8_intra_pred_uv_tm ssse3
   433 ;void vp8_intra_pred_uv_ve_mmx(
   434 ;    unsigned char *dst,
   435 ;    int dst_stride,
   436 ;    unsigned char *above,
   437 ;    unsigned char *left,
   438 ;    int left_stride
   439 ;    )
       ;
       ; Vertical prediction for an 8x8 block: the 8 bytes at above are copied
       ; into each of the 8 rows at dst. left and left_stride are accepted for
       ; signature compatibility but not read.
       ; Uses only rax/rcx/rdx as scratch, so no rsi/rdi save is needed.
   440 global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
   441 sym(vp8_intra_pred_uv_ve_mmx):
   442     push        rbp
   443     mov         rbp, rsp
   444     SHADOW_ARGS_TO_STACK 5
   445     ; end prolog
   447     ; arg(3), arg(4) not used
   449     ; read from top
   450     mov         rax,        arg(2) ;src;
   452     movq        mm1,        [rax]
   454     ; write out
   455     mov         rax,        arg(0) ;dst;
   456     movsxd      rdx,        dword ptr arg(1) ;dst_stride
   457     lea         rcx,        [rdx*3]  ; rcx = 3*dst_stride
   459     movq [rax      ],       mm1
   460     movq [rax+rdx  ],       mm1
   461     movq [rax+rdx*2],       mm1
   462     movq [rax+rcx  ],       mm1
   463     lea         rax,        [rax+rdx*4]  ; rax -> dst row 4
   464     movq [rax      ],       mm1
   465     movq [rax+rdx  ],       mm1
   466     movq [rax+rdx*2],       mm1
   467     movq [rax+rcx  ],       mm1
   469     ; begin epilog
   470     UNSHADOW_ARGS
   471     pop         rbp
   472     ret
   474 ;void vp8_intra_pred_uv_ho_mmx2(
   475 ;    unsigned char *dst,
   476 ;    int dst_stride,
   477 ;    unsigned char *above,
   478 ;    unsigned char *left,
   479 ;    int left_stride
   480 ;    )
       ;
       ; Macro generating vp8_intra_pred_uv_ho_<variant>; instantiated below for
       ; mmx2 and ssse3 (same C signature for both).
       ; Horizontal prediction for an 8x8 block: each output row is 8 copies of
       ; that row's left byte, read at stride left_stride. above is not read.
       ;   mmx2  : loop of 4 iterations, two rows each; punpcklbw + pshufw
       ;           broadcast byte 0 of each movd load across the register.
       ;   ssse3 : fully unrolled, four rows per half; two left bytes are
       ;           interleaved with punpcklbw and expanded with pshufb using
       ;           dc_00001111 (low 8 bytes <- byte 0, high 8 bytes <- byte 1).
       ; In the ssse3 variant rbx serves first as the GOT register and then as
       ; 3*left_stride; the GLOBAL() load happens before rbx is overwritten.
       ; NOTE(review): the extra push/pop of rbx is skipped when
       ; GET_GOT_SAVE_ARG is defined -- presumably because GET_GOT then saves
       ; its register itself; confirm in x86_abi_support.asm.
       ; Note: each movd reads 4 bytes at a left row; only the first is used.
   481 %macro vp8_intra_pred_uv_ho 1
   482 global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
   483 sym(vp8_intra_pred_uv_ho_%1):
   484     push        rbp
   485     mov         rbp, rsp
   486     SHADOW_ARGS_TO_STACK 5
   487     push        rsi
   488     push        rdi
   489 %ifidn %1, ssse3
   490 %ifndef GET_GOT_SAVE_ARG
   491     push        rbx
   492 %endif
   493     GET_GOT     rbx
   494 %endif
   495     ; end prolog
   497     ;arg(2) not used
   499     ; read from left and write out
   500 %ifidn %1, mmx2
   501     mov         edx,        4  ; loop counter: 4 iterations x 2 rows
   502 %endif
   503     mov         rsi,        arg(3) ;left
   504     movsxd      rax,        dword ptr arg(4) ;left_stride;
   505     mov         rdi,        arg(0) ;dst;
   506     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   507 %ifidn %1, ssse3
   508     lea         rdx,        [rcx*3]  ; rdx = 3*dst_stride
   509     movdqa      xmm2,       [GLOBAL(dc_00001111)]
   510     lea         rbx,        [rax*3]  ; rbx = 3*left_stride (GOT value no longer needed)
   511 %endif
   513 %ifidn %1, mmx2
   514 .vp8_intra_pred_uv_ho_%1_loop:
   515     movd        mm0,        [rsi]
   516     movd        mm1,        [rsi+rax]
   517     punpcklbw   mm0,        mm0  ; word 0 = left byte duplicated
   518     punpcklbw   mm1,        mm1
   519     pshufw      mm0,        mm0, 0x0  ; broadcast word 0 -> 8 identical bytes
   520     pshufw      mm1,        mm1, 0x0
   521     movq  [rdi    ],        mm0
   522     movq  [rdi+rcx],        mm1
   523     lea         rsi,        [rsi+rax*2]
   524     lea         rdi,        [rdi+rcx*2]
   525     dec         edx
   526     jnz .vp8_intra_pred_uv_ho_%1_loop
   527 %else
   528     movd        xmm0,       [rsi]
   529     movd        xmm3,       [rsi+rax]
   530     movd        xmm1,       [rsi+rax*2]
   531     movd        xmm4,       [rsi+rbx]
   532     punpcklbw   xmm0,       xmm3  ; byte 0 = row 0 sample, byte 1 = row 1 sample
   533     punpcklbw   xmm1,       xmm4  ; byte 0 = row 2 sample, byte 1 = row 3 sample
   534     pshufb      xmm0,       xmm2
   535     pshufb      xmm1,       xmm2
   536     movq   [rdi    ],       xmm0
   537     movhps [rdi+rcx],       xmm0
   538     movq [rdi+rcx*2],       xmm1
   539     movhps [rdi+rdx],       xmm1
   540     lea         rsi,        [rsi+rax*4]  ; advance to rows 4-7
   541     lea         rdi,        [rdi+rcx*4]
   542     movd        xmm0,       [rsi]
   543     movd        xmm3,       [rsi+rax]
   544     movd        xmm1,       [rsi+rax*2]
   545     movd        xmm4,       [rsi+rbx]
   546     punpcklbw   xmm0,       xmm3
   547     punpcklbw   xmm1,       xmm4
   548     pshufb      xmm0,       xmm2
   549     pshufb      xmm1,       xmm2
   550     movq   [rdi    ],       xmm0
   551     movhps [rdi+rcx],       xmm0
   552     movq [rdi+rcx*2],       xmm1
   553     movhps [rdi+rdx],       xmm1
   554 %endif
   556     ; begin epilog
   557 %ifidn %1, ssse3
   558     RESTORE_GOT
   559 %ifndef GET_GOT_SAVE_ARG
   560     pop         rbx
   561 %endif
   562 %endif
   563     pop         rdi
   564     pop         rsi
   565     UNSHADOW_ARGS
   566     pop         rbp
   567     ret
   568 %endmacro
   570 vp8_intra_pred_uv_ho mmx2
   571 vp8_intra_pred_uv_ho ssse3
   573 ;void vp8_intra_pred_y_dc_sse2(
   574 ;    unsigned char *dst,
   575 ;    int dst_stride,
   576 ;    unsigned char *above,
   577 ;    unsigned char *left,
   578 ;    int left_stride
   579 ;    )
       ;
       ; DC prediction for a 16x16 block using both edges: every byte of the
       ; 16 rows x 16 bytes written at dst is set to
       ;   (sum of 16 bytes at above + sum of 16 bytes down the left column + 16) >> 5
       ; where the left column is read one byte per row at stride left_stride.
       ; above is loaded with movdqa and dst rows are stored with movdqa, so
       ; above, dst and dst_stride must keep those accesses 16-byte aligned.
       ; Register roles while summing: rsi = left cursor, rax = left_stride,
       ; rdi = 3*left_stride, ecx = running left sum, edx = scratch.
   580 global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
   581 sym(vp8_intra_pred_y_dc_sse2):
   582     push        rbp
   583     mov         rbp, rsp
   584     SHADOW_ARGS_TO_STACK 5
   585     push        rsi
   586     push        rdi
   587     ; end prolog
   589     ; from top
   590     mov         rdi,        arg(2) ;above
   591     mov         rsi,        arg(3) ;left
   592     movsxd      rax,        dword ptr arg(4) ;left_stride;
   594     pxor        xmm0,       xmm0
   595     movdqa      xmm1,       [rdi]
   596     psadbw      xmm1,       xmm0  ; two partial sums, one per 8-byte half
   597     movq        xmm2,       xmm1
   598     punpckhqdq  xmm1,       xmm1
   599     paddw       xmm1,       xmm2  ; low word = sum of all 16 above bytes
   601     ; from left
   602     lea         rdi,        [rax*3]  ; rdi = 3*left_stride
   604     movzx       ecx,        byte [rsi]
   605     movzx       edx,        byte [rsi+rax]
   606     add         ecx,        edx
   607     movzx       edx,        byte [rsi+rax*2]
   608     add         ecx,        edx
   609     movzx       edx,        byte [rsi+rdi]
   610     add         ecx,        edx
   611     lea         rsi,        [rsi+rax*4]
   613     movzx       edx,        byte [rsi]
   614     add         ecx,        edx
   615     movzx       edx,        byte [rsi+rax]
   616     add         ecx,        edx
   617     movzx       edx,        byte [rsi+rax*2]
   618     add         ecx,        edx
   619     movzx       edx,        byte [rsi+rdi]
   620     add         ecx,        edx
   621     lea         rsi,        [rsi+rax*4]
   623     movzx       edx,        byte [rsi]
   624     add         ecx,        edx
   625     movzx       edx,        byte [rsi+rax]
   626     add         ecx,        edx
   627     movzx       edx,        byte [rsi+rax*2]
   628     add         ecx,        edx
   629     movzx       edx,        byte [rsi+rdi]
   630     add         ecx,        edx
   631     lea         rsi,        [rsi+rax*4]
   633     movzx       edx,        byte [rsi]
   634     add         ecx,        edx
   635     movzx       edx,        byte [rsi+rax]
   636     add         ecx,        edx
   637     movzx       edx,        byte [rsi+rax*2]
   638     add         ecx,        edx
   639     movzx       edx,        byte [rsi+rdi]
   640     add         ecx,        edx
   642     ; add up
   643     pextrw      edx,        xmm1, 0x0  ; edx = top sum
   644     lea         edx,        [edx+ecx+16]  ; top + left + rounding term
   645     sar         edx,        5  ; divide by 32 samples
   646     movd        xmm1,       edx
   647     ; FIXME use pshufb for ssse3 version
   648     pshuflw     xmm1,       xmm1, 0x0
   649     punpcklqdq  xmm1,       xmm1
   650     packuswb    xmm1,       xmm1  ; 16 identical bytes
   652     ; write out
   653     mov         rsi,        2  ; two passes of 8 rows each
   654     mov         rdi,        arg(0) ;dst;
   655     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   656     lea         rax,        [rcx*3]  ; rax = 3*dst_stride
   658 .label:
   659     movdqa [rdi      ],     xmm1
   660     movdqa [rdi+rcx  ],     xmm1
   661     movdqa [rdi+rcx*2],     xmm1
   662     movdqa [rdi+rax  ],     xmm1
   663     lea         rdi,        [rdi+rcx*4]
   664     movdqa [rdi      ],     xmm1
   665     movdqa [rdi+rcx  ],     xmm1
   666     movdqa [rdi+rcx*2],     xmm1
   667     movdqa [rdi+rax  ],     xmm1
   668     lea         rdi,        [rdi+rcx*4]
   669     dec         rsi
   670     jnz .label
   672     ; begin epilog
   673     pop         rdi
   674     pop         rsi
   675     UNSHADOW_ARGS
   676     pop         rbp
   677     ret
   679 ;void vp8_intra_pred_y_dctop_sse2(
   680 ;    unsigned char *dst,
   681 ;    int dst_stride,
   682 ;    unsigned char *above,
   683 ;    unsigned char *left,
   684 ;    int left_stride
   685 ;    )
       ;
       ; DC prediction for a 16x16 block using only the top edge: every byte of
       ; the 16 rows x 16 bytes written at dst is set to
       ;   (sum of 16 bytes at above + 8) >> 4
       ; left and left_stride are accepted for signature compatibility but not read.
       ; above is loaded with movdqa and dst rows are stored with movdqa, so
       ; above, dst and dst_stride must keep those accesses 16-byte aligned.
   686 global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
   687 sym(vp8_intra_pred_y_dctop_sse2):
   688     push        rbp
   689     mov         rbp, rsp
   690     SHADOW_ARGS_TO_STACK 5
   691     push        rsi
   692     GET_GOT     rbx
   693     ; end prolog
   695     ;arg(3), arg(4) not used
   697     ; from top
   698     mov         rcx,        arg(2) ;above;
   699     pxor        xmm0,       xmm0
   700     movdqa      xmm1,       [rcx]
   701     psadbw      xmm1,       xmm0  ; two partial sums, one per 8-byte half
   702     movdqa      xmm2,       xmm1
   703     punpckhqdq  xmm1,       xmm1
   704     paddw       xmm1,       xmm2  ; low word = sum of all 16 above bytes
   706     ; add up
   707     paddw       xmm1,       [GLOBAL(dc_8)]  ; rounding term
   708     psraw       xmm1,       4  ; divide by 16 samples
   709     ; FIXME use pshufb for ssse3 version
   710     pshuflw     xmm1,       xmm1, 0x0
   711     punpcklqdq  xmm1,       xmm1
   712     packuswb    xmm1,       xmm1  ; 16 identical bytes
   714     ; write out
   715     mov         rsi,        2  ; two passes of 8 rows each
   716     mov         rdx,        arg(0) ;dst;
   717     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   718     lea         rax,        [rcx*3]  ; rax = 3*dst_stride
   720 .label:
   721     movdqa [rdx      ],     xmm1
   722     movdqa [rdx+rcx  ],     xmm1
   723     movdqa [rdx+rcx*2],     xmm1
   724     movdqa [rdx+rax  ],     xmm1
   725     lea         rdx,        [rdx+rcx*4]
   726     movdqa [rdx      ],     xmm1
   727     movdqa [rdx+rcx  ],     xmm1
   728     movdqa [rdx+rcx*2],     xmm1
   729     movdqa [rdx+rax  ],     xmm1
   730     lea         rdx,        [rdx+rcx*4]
   731     dec         rsi
   732     jnz .label
   734     ; begin epilog
   735     RESTORE_GOT
   736     pop         rsi
   737     UNSHADOW_ARGS
   738     pop         rbp
   739     ret
   741 ;void vp8_intra_pred_y_dcleft_sse2(
   742 ;    unsigned char *dst,
   743 ;    int dst_stride,
   744 ;    unsigned char *above,
   745 ;    unsigned char *left,
   746 ;    int left_stride
   747 ;    )
       ;
       ; DC prediction for a 16x16 block using only the left edge: every byte of
       ; the 16 rows x 16 bytes written at dst is set to
       ;   (sum of 16 bytes down the left column + 8) >> 4
       ; where the column is read one byte per row at stride left_stride.
       ; above is accepted for signature compatibility but not read.
       ; dst rows are stored with movdqa, so dst and dst_stride must keep every
       ; row 16-byte aligned.
   748 global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
   749 sym(vp8_intra_pred_y_dcleft_sse2):
   750     push        rbp
   751     mov         rbp, rsp
   752     SHADOW_ARGS_TO_STACK 5
   753     push        rsi
   754     push        rdi
   755     ; end prolog
   757     ;arg(2) not used
   759     ; from left
   760     mov         rsi,        arg(3) ;left;
   761     movsxd      rax,        dword ptr arg(4) ;left_stride;
   763     lea         rdi,        [rax*3]  ; rdi = 3*left_stride
   764     movzx       ecx,        byte [rsi]
   765     movzx       edx,        byte [rsi+rax]
   766     add         ecx,        edx
   767     movzx       edx,        byte [rsi+rax*2]
   768     add         ecx,        edx
   769     movzx       edx,        byte [rsi+rdi]
   770     add         ecx,        edx
   771     lea         rsi,        [rsi+rax*4]
   772     movzx       edx,        byte [rsi]
   773     add         ecx,        edx
   774     movzx       edx,        byte [rsi+rax]
   775     add         ecx,        edx
   776     movzx       edx,        byte [rsi+rax*2]
   777     add         ecx,        edx
   778     movzx       edx,        byte [rsi+rdi]
   779     add         ecx,        edx
   780     lea         rsi,        [rsi+rax*4]
   781     movzx       edx,        byte [rsi]
   782     add         ecx,        edx
   783     movzx       edx,        byte [rsi+rax]
   784     add         ecx,        edx
   785     movzx       edx,        byte [rsi+rax*2]
   786     add         ecx,        edx
   787     movzx       edx,        byte [rsi+rdi]
   788     add         ecx,        edx
   789     lea         rsi,        [rsi+rax*4]
   790     movzx       edx,        byte [rsi]
   791     add         ecx,        edx
   792     movzx       edx,        byte [rsi+rax]
   793     add         ecx,        edx
   794     movzx       edx,        byte [rsi+rax*2]
   795     add         ecx,        edx
   796     movzx       edx,        byte [rsi+rdi]
   797     lea         edx,        [ecx+edx+8]  ; fold in the last sample and the rounding term
   799     ; add up
   800     shr         edx,        4  ; divide by 16 samples
   801     movd        xmm1,       edx
   802     ; FIXME use pshufb for ssse3 version
   803     pshuflw     xmm1,       xmm1, 0x0
   804     punpcklqdq  xmm1,       xmm1
   805     packuswb    xmm1,       xmm1  ; 16 identical bytes
   807     ; write out
   808     mov         rsi,        2  ; two passes of 8 rows each
   809     mov         rdi,        arg(0) ;dst;
   810     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   811     lea         rax,        [rcx*3]  ; rax = 3*dst_stride
   813 .label:
   814     movdqa [rdi      ],     xmm1
   815     movdqa [rdi+rcx  ],     xmm1
   816     movdqa [rdi+rcx*2],     xmm1
   817     movdqa [rdi+rax  ],     xmm1
   818     lea         rdi,        [rdi+rcx*4]
   819     movdqa [rdi      ],     xmm1
   820     movdqa [rdi+rcx  ],     xmm1
   821     movdqa [rdi+rcx*2],     xmm1
   822     movdqa [rdi+rax  ],     xmm1
   823     lea         rdi,        [rdi+rcx*4]
   824     dec         rsi
   825     jnz .label
   827     ; begin epilog
   828     pop         rdi
   829     pop         rsi
   830     UNSHADOW_ARGS
   831     pop         rbp
   832     ret
   834 ;void vp8_intra_pred_y_dc128_sse2(
   835 ;    unsigned char *dst,
   836 ;    int dst_stride,
   837 ;    unsigned char *above,
   838 ;    unsigned char *left,
   839 ;    int left_stride
   840 ;    )
       ;
       ; DC prediction with no edge data: fills 16 rows x 16 bytes at dst with
       ; the constant byte 128 (dc_128). above, left and left_stride are
       ; accepted for signature compatibility but not read.
       ; dst rows are stored with movdqa, so dst and dst_stride must keep every
       ; row 16-byte aligned.
   841 global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
   842 sym(vp8_intra_pred_y_dc128_sse2):
   843     push        rbp
   844     mov         rbp, rsp
   845     SHADOW_ARGS_TO_STACK 5
   846     push        rsi
   847     GET_GOT     rbx
   848     ; end prolog
   850     ;arg(2), arg(3), arg(4) not used
   852     ; write out
   853     mov         rsi,        2  ; two passes of 8 rows each
   854     movdqa      xmm1,       [GLOBAL(dc_128)]
   855     mov         rax,        arg(0) ;dst;
   856     movsxd      rdx,        dword ptr arg(1) ;dst_stride
   857     lea         rcx,        [rdx*3]  ; rcx = 3*dst_stride
   859 .label:
   860     movdqa [rax      ],     xmm1
   861     movdqa [rax+rdx  ],     xmm1
   862     movdqa [rax+rdx*2],     xmm1
   863     movdqa [rax+rcx  ],     xmm1
   864     lea         rax,        [rax+rdx*4]
   865     movdqa [rax      ],     xmm1
   866     movdqa [rax+rdx  ],     xmm1
   867     movdqa [rax+rdx*2],     xmm1
   868     movdqa [rax+rcx  ],     xmm1
   869     lea         rax,        [rax+rdx*4]
   870     dec         rsi
   871     jnz .label
   873     ; begin epilog
   874     RESTORE_GOT
   875     pop         rsi
   876     UNSHADOW_ARGS
   877     pop         rbp
   878     ret
   880 ;void vp8_intra_pred_y_tm_sse2(
   881 ;    unsigned char *dst,
   882 ;    int dst_stride,
   883 ;    unsigned char *above,
   884 ;    unsigned char *left,
   885 ;    int left_stride
   886 ;    )
       ;
       ; Macro generating vp8_intra_pred_y_tm_<variant>; instantiated below for
       ; sse2 and ssse3 (same C signature for both).
       ; For a 16x16 block, each output byte is
       ;   saturate_u8(left[row] + above[col] - above[-1])
       ; with left read one byte per row at stride left_stride. The saturation
       ; comes from packuswb.
       ; xmm1/xmm2 hold (above[0..7] - topleft) and (above[8..15] - topleft) as
       ; words for the whole loop; the loop runs 8 times, two rows per iteration.
       ; above is loaded with movdqa and dst rows are stored with movdqa, so
       ; above, dst and dst_stride must keep those accesses 16-byte aligned.
       ; xmm6/xmm7 are used as temporaries; SAVE_XMM 7 / RESTORE_XMM (from
       ; x86_abi_support.asm) presumably preserve them where the ABI requires
       ; it -- confirm there.
       ; Note: movd reads 4 bytes at above-1 and at each left row; only the
       ; first byte of each load is used.
       ; The loop label is a local (dot-prefixed) label so that it stays scoped
       ; to the generated function, as in the uv_tm and uv_ho macros.
   887 %macro vp8_intra_pred_y_tm 1
   888 global sym(vp8_intra_pred_y_tm_%1) PRIVATE
   889 sym(vp8_intra_pred_y_tm_%1):
   890     push        rbp
   891     mov         rbp, rsp
   892     SHADOW_ARGS_TO_STACK 5
   893     SAVE_XMM 7
   894     push        rsi
   895     push        rdi
   896     GET_GOT     rbx
   897     ; end prolog
   899     ; read top row
   900     mov         edx,        8  ; loop counter: 8 iterations x 2 rows
   901     mov         rsi,        arg(2) ;above
   902     movsxd      rax,        dword ptr arg(4) ;left_stride;
   903     pxor        xmm0,       xmm0
   904 %ifidn %1, ssse3
   905     movdqa      xmm3,       [GLOBAL(dc_1024)]
   906 %endif
   907     movdqa      xmm1,       [rsi]
   908     movdqa      xmm2,       xmm1
   909     punpcklbw   xmm1,       xmm0  ; above[0..7] widened to words
   910     punpckhbw   xmm2,       xmm0  ; above[8..15] widened to words
   912     ; set up left ptrs and subtract topleft
   913     movd        xmm4,       [rsi-1]  ; byte 0 = above[-1] (top-left)
   914     mov         rsi,        arg(3) ;left
   915 %ifidn %1, sse2
   916     punpcklbw   xmm4,       xmm0
   917     pshuflw     xmm4,       xmm4, 0x0
   918     punpcklqdq  xmm4,       xmm4
   919 %else
   920     pshufb      xmm4,       xmm3
   921 %endif
   922     psubw       xmm1,       xmm4
   923     psubw       xmm2,       xmm4
   925     ; set up dest ptrs
   926     mov         rdi,        arg(0) ;dst;
   927     movsxd      rcx,        dword ptr arg(1) ;dst_stride
   928 .vp8_intra_pred_y_tm_%1_loop:
   929     movd        xmm4,       [rsi]  ; left sample for the even row
   930     movd        xmm5,       [rsi+rax]  ; left sample for the odd row
   931 %ifidn %1, sse2
   932     punpcklbw   xmm4,       xmm0
   933     punpcklbw   xmm5,       xmm0
   934     pshuflw     xmm4,       xmm4, 0x0
   935     pshuflw     xmm5,       xmm5, 0x0
   936     punpcklqdq  xmm4,       xmm4
   937     punpcklqdq  xmm5,       xmm5
   938 %else
   939     pshufb      xmm4,       xmm3
   940     pshufb      xmm5,       xmm3
   941 %endif
   942     movdqa      xmm6,       xmm4
   943     movdqa      xmm7,       xmm5
   944     paddw       xmm4,       xmm1
   945     paddw       xmm6,       xmm2
   946     paddw       xmm5,       xmm1
   947     paddw       xmm7,       xmm2
   948     packuswb    xmm4,       xmm6  ; even row, 16 saturated bytes
   949     packuswb    xmm5,       xmm7  ; odd row, 16 saturated bytes
   950     movdqa [rdi    ],       xmm4
   951     movdqa [rdi+rcx],       xmm5
   952     lea         rsi,        [rsi+rax*2]
   953     lea         rdi,        [rdi+rcx*2]
   954     dec         edx
   955     jnz .vp8_intra_pred_y_tm_%1_loop
   957     ; begin epilog
   958     RESTORE_GOT
   959     pop         rdi
   960     pop         rsi
   961     RESTORE_XMM
   962     UNSHADOW_ARGS
   963     pop         rbp
   964     ret
   965 %endmacro
   967 vp8_intra_pred_y_tm sse2
   968 vp8_intra_pred_y_tm ssse3
   970 ;void vp8_intra_pred_y_ve_sse2(
   971 ;    unsigned char *dst,
   972 ;    int dst_stride,
   973 ;    unsigned char *above,
   974 ;    unsigned char *left,
   975 ;    int left_stride
   976 ;    )
       ;
       ; Vertical prediction for a 16x16 block: the 16 bytes at above are copied
       ; into each of the 16 rows at dst. left and left_stride are accepted for
       ; signature compatibility but not read.
       ; above is loaded with movdqa and dst rows are stored with movdqa, so
       ; above, dst and dst_stride must keep those accesses 16-byte aligned.
   977 global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
   978 sym(vp8_intra_pred_y_ve_sse2):
   979     push        rbp
   980     mov         rbp, rsp
   981     SHADOW_ARGS_TO_STACK 5
   982     push        rsi
   983     ; end prolog
   985     ;arg(3), arg(4) not used
   987     mov         rax,        arg(2) ;above;
   988     mov         rsi,        2  ; two passes of 8 rows each
   989     movsxd      rdx,        dword ptr arg(1) ;dst_stride
   991     ; read from top
   992     movdqa      xmm1,       [rax]
   994     ; write out
   995     mov         rax,        arg(0) ;dst;
   996     lea         rcx,        [rdx*3]  ; rcx = 3*dst_stride
   998 .label:
   999     movdqa [rax      ],     xmm1
  1000     movdqa [rax+rdx  ],     xmm1
  1001     movdqa [rax+rdx*2],     xmm1
  1002     movdqa [rax+rcx  ],     xmm1
  1003     lea         rax,        [rax+rdx*4]
  1004     movdqa [rax      ],     xmm1
  1005     movdqa [rax+rdx  ],     xmm1
  1006     movdqa [rax+rdx*2],     xmm1
  1007     movdqa [rax+rcx  ],     xmm1
  1008     lea         rax,        [rax+rdx*4]
  1009     dec         rsi
  1010     jnz .label
  1012     ; begin epilog
  1013     pop         rsi
  1014     UNSHADOW_ARGS
  1015     pop         rbp
  1016     ret
  1018 ;void vp8_intra_pred_y_ho_sse2(
  1019 ;    unsigned char *dst,
  1020 ;    int dst_stride,
  1021 ;    unsigned char *above,
  1022 ;    unsigned char *left,
  1023 ;    int left_stride
  1024 ;    )
       ;
       ; Horizontal prediction for a 16x16 block: each output row is 16 copies
       ; of that row's left byte, read at stride left_stride. above is accepted
       ; for signature compatibility but not read.
       ; The loop runs 8 times and produces two rows per iteration.
       ; dst rows are stored with movdqa, so dst and dst_stride must keep every
       ; row 16-byte aligned.
       ; Note: each movd reads 4 bytes at a left row; only the first is used.
       ; The loop label is a local (dot-prefixed) label so that it stays scoped
       ; to this function.
  1025 global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
  1026 sym(vp8_intra_pred_y_ho_sse2):
  1027     push        rbp
  1028     mov         rbp, rsp
  1029     SHADOW_ARGS_TO_STACK 5
  1030     push        rsi
  1031     push        rdi
  1032     ; end prolog
  1034     ;arg(2) not used
  1036     ; read from left and write out
  1037     mov         edx,        8  ; loop counter: 8 iterations x 2 rows
  1038     mov         rsi,        arg(3) ;left;
  1039     movsxd      rax,        dword ptr arg(4) ;left_stride;
  1040     mov         rdi,        arg(0) ;dst;
  1041     movsxd      rcx,        dword ptr arg(1) ;dst_stride
  1043 .vp8_intra_pred_y_ho_sse2_loop:
  1044     movd        xmm0,       [rsi]
  1045     movd        xmm1,       [rsi+rax]
  1046     ; FIXME use pshufb for ssse3 version
  1047     punpcklbw   xmm0,       xmm0  ; word 0 = left byte duplicated
  1048     punpcklbw   xmm1,       xmm1
  1049     pshuflw     xmm0,       xmm0, 0x0  ; broadcast word 0 across the low quadword
  1050     pshuflw     xmm1,       xmm1, 0x0
  1051     punpcklqdq  xmm0,       xmm0  ; duplicate into the high quadword -> 16 identical bytes
  1052     punpcklqdq  xmm1,       xmm1
  1053     movdqa [rdi    ],       xmm0
  1054     movdqa [rdi+rcx],       xmm1
  1055     lea         rsi,        [rsi+rax*2]
  1056     lea         rdi,        [rdi+rcx*2]
  1057     dec         edx
  1058     jnz .vp8_intra_pred_y_ho_sse2_loop
  1060     ; begin epilog
  1061     pop         rdi
  1062     pop         rsi
  1063     UNSHADOW_ARGS
  1064     pop         rbp
  1065     ret
  1067 SECTION_RODATA
       ; Read-only constants used by the predictors in this file.
  1068 align 16
       ; 16 bytes of 128: fill value for the dc128 predictors (the MMX version
       ; reads only the low 8 bytes with movq, the SSE2 version all 16 with movdqa).
  1069 dc_128:
  1070     times 16 db 128
       ; Four words of 4: rounding term added with an MMX paddw before the >> 3
       ; in the 8x8 top-only DC predictor. No align directive precedes it; it
       ; directly follows the 16 bytes of dc_128.
  1071 dc_4:
  1072     times 4 dw 4
  1073 align 16
       ; Eight words of 8: rounding term added before the >> 4 in the 16x16
       ; top-only DC predictor (used as an SSE2 memory operand).
  1074 dc_8:
  1075     times 8 dw 8
  1076 align 16
       ; pshufb mask, byte pattern 00 04 repeated: each result word takes source
       ; byte 0 as its low byte and source byte 4 as its high byte. Used by the
       ; ssse3 TM predictors to broadcast one byte as eight words.
  1077 dc_1024:
  1078     times 8 dw 0x400
  1079 align 16
       ; pshufb mask: low 8 result bytes copy source byte 0, high 8 copy source
       ; byte 1. Used by the ssse3 horizontal predictor to expand two left
       ; samples into two 8-byte rows.
  1080 dc_00001111:
  1081     times 8 db 0
  1082     times 8 db 1

mercurial