media/libvpx/vp8/common/x86/subpixel_ssse3.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %define BLOCK_HEIGHT_WIDTH 4
    15 %define VP8_FILTER_WEIGHT 128
    16 %define VP8_FILTER_SHIFT  7
    19 ;/************************************************************************************
    20 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
    21 ; input pixel array has output_height rows. This routine assumes that output_height is an
    22 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE
    23 ; row each iteration to take advantage of the 128-bit operations.
    24 ;
    25 ; This is an implementation of some of the SSE optimizations first seen in ffvp8
    26 ;
    27 ;*************************************************************************************/
    28 ;void vp8_filter_block1d8_h6_ssse3
    29 ;(
    30 ;    unsigned char  *src_ptr,
    31 ;    unsigned int    src_pixels_per_line,
    32 ;    unsigned char *output_ptr,
    33 ;    unsigned int    output_pitch,
    34 ;    unsigned int    output_height,
    35 ;    unsigned int    vp8_filter_index
    36 ;)
       ;
       ; Horizontal sub-pixel filter, 8 output pixels per row.
       ;
       ; vp8_filter_index selects a 16-byte row of the k0_k5 table (index << 4);
       ; the k1_k3 and k2_k4 rows are read 128 and 256 bytes past that row.
       ; When the first dword of the k0_k5 row is zero the routine branches to
       ; .vp8_filter_block1d8_h4_ssse3, which skips the k0_k5 multiply (4-tap).
       ;
       ; Per row: bytes src[-2..5] and src[3..10] are interleaved, multiplied
       ; with pmaddubsw, summed with saturation, rounded with rd, shifted right
       ; by 7, packed to unsigned bytes and 8 bytes are stored at the output row.
       ;
       ; Both row loops are "dec rcx / jnz" do-while loops: output_height must
       ; be non-zero.
       ; NOTE(review): the loops advance one row at a time, so the "even
       ; output_height" assumption above does not look necessary here - confirm.
       ; rd, k0_k5, shuf2bfrom1 and shuf3bfrom1 are defined outside this routine.
    37 global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
    38 sym(vp8_filter_block1d8_h6_ssse3):
    39     push        rbp
    40     mov         rbp, rsp
    41     SHADOW_ARGS_TO_STACK 6
    42     SAVE_XMM 7
    43     GET_GOT     rbx
    44     push        rsi
    45     push        rdi
    46     ; end prolog
    48     movsxd      rdx, DWORD PTR arg(5)   ;table index
    49     xor         rsi, rsi                ;esi = 0, compared with the filter row below
    50     shl         rdx, 4                  ;16 bytes per table row
    52     movdqa      xmm7, [GLOBAL(rd)]      ;rounding constant, used by both paths
    54     lea         rax, [GLOBAL(k0_k5)]
    55     add         rax, rdx
    56     mov         rdi, arg(2)             ;output_ptr
    58     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 row == 0 ?
    59     je          .vp8_filter_block1d8_h4_ssse3
    61     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    62     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    63     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    65     mov         rsi, arg(0)             ;src_ptr
    66     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    67     movsxd      rcx, dword ptr arg(4)   ;output_height
    69     movsxd      rdx, dword ptr arg(3)   ;output_pitch
    71     sub         rdi, rdx                ;pre-bias: the loop advances rdi before it stores
    72 ;xmm3 free
    73 .filter_block1d8_h6_rowloop_ssse3:
    74     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    76     movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    78     punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
    80     movdqa      xmm1,   xmm0
    81     pmaddubsw   xmm0,   xmm4
    83     movdqa      xmm2,   xmm1
    84     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    86     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    87     pmaddubsw   xmm1,   xmm5
    89     lea         rdi,    [rdi + rdx]             ; next output row (lea keeps flags intact)
    90     pmaddubsw   xmm2,   xmm6
    92     lea         rsi,    [rsi + rax]             ; next source row
    93     dec         rcx                             ; ZF consumed by the jnz below; no flag writer in between
    95     paddsw      xmm0,   xmm1
    96     paddsw      xmm2,   xmm7                    ; + rounding
    98     paddsw      xmm0,   xmm2
   100     psraw       xmm0,   7
   102     packuswb    xmm0,   xmm0
   104     movq        MMWORD Ptr [rdi], xmm0          ; store 8 pixels
   105     jnz         .filter_block1d8_h6_rowloop_ssse3
   107     ; begin epilog
   108     pop rdi
   109     pop rsi
   110     RESTORE_GOT
   111     RESTORE_XMM
   112     UNSHADOW_ARGS
   113     pop         rbp
   114     ret
       ; 4-tap path: entered with rax = selected k0_k5 row, rdi = output_ptr,
       ; xmm7 = rd. Local label, matching the 4-tap entries of the sibling routines.
   116 .vp8_filter_block1d8_h4_ssse3:
   117     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   118     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   120     movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
   121     movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
   123     mov         rsi, arg(0)             ;src_ptr
   125     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
   126     movsxd      rcx, dword ptr arg(4)   ;output_height
   128     movsxd      rdx, dword ptr arg(3)   ;output_pitch
   130     sub         rdi, rdx                ;pre-bias: the loop advances rdi before it stores
   132 .filter_block1d8_h4_rowloop_ssse3:
   133     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
   135     movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
   137     punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
   139     movdqa      xmm2,   xmm0
   140     pshufb      xmm0,   xmm3
   142     pshufb      xmm2,   xmm4
   143     pmaddubsw   xmm0,   xmm5
   145     lea         rdi,    [rdi + rdx]             ; next output row (lea keeps flags intact)
   146     pmaddubsw   xmm2,   xmm6
   148     lea         rsi,    [rsi + rax]             ; next source row
   149     dec         rcx                             ; ZF consumed by the jnz below
   151     paddsw      xmm0,   xmm7                    ; + rounding
   153     paddsw      xmm0,   xmm2
   155     psraw       xmm0,   7
   157     packuswb    xmm0,   xmm0
   159     movq        MMWORD Ptr [rdi], xmm0          ; store 8 pixels
   161     jnz         .filter_block1d8_h4_rowloop_ssse3
   163     ; begin epilog
   164     pop rdi
   165     pop rsi
   166     RESTORE_GOT
   167     RESTORE_XMM
   168     UNSHADOW_ARGS
   169     pop         rbp
   170     ret
   171 ;void vp8_filter_block1d16_h6_ssse3
   172 ;(
   173 ;    unsigned char  *src_ptr,
   174 ;    unsigned int    src_pixels_per_line,
   175 ;    unsigned char  *output_ptr,
   176 ;    unsigned int    output_pitch,
   177 ;    unsigned int    output_height,
   178 ;    unsigned int    vp8_filter_index
   179 ;)
       ;
       ; Horizontal 6-tap sub-pixel filter, 16 output pixels per row.
       ;
       ; vp8_filter_index selects a 16-byte row of the k0_k5 table (index << 4);
       ; the k1_k3 and k2_k4 rows are read 128 and 256 bytes past that row.
       ; Unlike the 8- and 4-wide horizontal routines there is no 4-tap
       ; shortcut here: all three coefficient rows are always applied.
       ;
       ; Each row is computed as two 8-pixel halves (source bytes -2..10 and
       ; 6..18 relative to src_ptr), each rounded with rd, shifted right by 7
       ; and packed to unsigned bytes; the halves are joined and written with
       ; one movdqa, so every output row address must be 16-byte aligned.
       ; xmm7 is used as scratch inside the loop, which is why rd is added
       ; straight from memory.
       ;
       ; The row loop is a "dec rcx / jnz" do-while loop: output_height must be
       ; non-zero.
   180 global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
   181 sym(vp8_filter_block1d16_h6_ssse3):
   182     push        rbp
   183     mov         rbp, rsp
   184     SHADOW_ARGS_TO_STACK 6
   185     SAVE_XMM 7
   186     GET_GOT     rbx
   187     push        rsi
   188     push        rdi
   189     ; end prolog
   191     movsxd      rdx, DWORD PTR arg(5)           ;table index
   193     shl         rdx, 4      ;16 bytes per table row
   195     lea         rax, [GLOBAL(k0_k5)]
   196     add         rax, rdx
   198     mov         rdi, arg(2)                     ;output_ptr
   200     mov         rsi, arg(0)                     ;src_ptr
   202     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
   203     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   204     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   206     movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
   207     movsxd      rcx, dword ptr arg(4)           ;output_height
   208     movsxd      rdx, dword ptr arg(3)           ;output_pitch
   210 .filter_block1d16_h6_rowloop_ssse3:
   211     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
   213     movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
   215     punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
   217     movdqa      xmm1,   xmm0
   218     pmaddubsw   xmm0,   xmm4
   220     movdqa      xmm2,   xmm1
   221     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
   223     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
   224     movq        xmm3,   MMWORD PTR [rsi +  6]   ; second half:  6 .. 13
   226     pmaddubsw   xmm1,   xmm5
   227     movq        xmm7,   MMWORD PTR [rsi + 11]   ; second half: 11 .. 18
   229     pmaddubsw   xmm2,   xmm6
   230     punpcklbw   xmm3,   xmm7
   232     paddsw      xmm0,   xmm1
   233     movdqa      xmm1,   xmm3
   235     pmaddubsw   xmm3,   xmm4
   236     paddsw      xmm0,   xmm2
   238     movdqa      xmm2,   xmm1
   239     paddsw      xmm0,   [GLOBAL(rd)]            ; + rounding (first half)
   241     pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
   242     pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
   244     psraw       xmm0,   7
   245     pmaddubsw   xmm1,   xmm5
   247     pmaddubsw   xmm2,   xmm6
   248     packuswb    xmm0,   xmm0
   250     lea         rsi,    [rsi + rax]             ; next source row
   251     paddsw      xmm3,   xmm1
   253     paddsw      xmm3,   xmm2
   255     paddsw      xmm3,   [GLOBAL(rd)]            ; + rounding (second half)
   257     psraw       xmm3,   7
   259     packuswb    xmm3,   xmm3
   261     punpcklqdq  xmm0,   xmm3                    ; low 8 pixels | high 8 pixels
   263     movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store
   265     lea         rdi,    [rdi + rdx]             ; next output row
   266     dec         rcx
   267     jnz         .filter_block1d16_h6_rowloop_ssse3
   269     ; begin epilog
   270     pop rdi
   271     pop rsi
   272     RESTORE_GOT
   273     RESTORE_XMM
   274     UNSHADOW_ARGS
   275     pop         rbp
   276     ret
   278 ;void vp8_filter_block1d4_h6_ssse3
   279 ;(
   280 ;    unsigned char  *src_ptr,
   281 ;    unsigned int    src_pixels_per_line,
   282 ;    unsigned char  *output_ptr,
   283 ;    unsigned int    output_pitch,
   284 ;    unsigned int    output_height,
   285 ;    unsigned int    vp8_filter_index
   286 ;)
       ;
       ; Horizontal sub-pixel filter, 4 output pixels per row.
       ;
       ; vp8_filter_index selects a 16-byte row of the k0_k5 table (index << 4);
       ; the k1_k3 and k2_k4 rows are read 128 and 256 bytes past that row.
       ; When the first dword of the k0_k5 row is zero the routine branches to
       ; .vp8_filter_block1d4_h4_ssse3, which skips the k0_k5 multiply (4-tap).
       ;
       ; Per row: one unaligned 16-byte load from src_ptr - 2 is rearranged with
       ; pshufb (shuf1b/shuf2b/shuf3b, defined outside this routine), multiplied
       ; with pmaddubsw, summed with saturation, rounded with rd, shifted right
       ; by 7, packed to unsigned bytes and 4 bytes are stored at the output row.
       ; Note the load covers src_ptr[-2 .. 13] on every row.
       ;
       ; Both row loops are "dec rcx / jnz" do-while loops: output_height must
       ; be non-zero.
   287 global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
   288 sym(vp8_filter_block1d4_h6_ssse3):
   289     push        rbp
   290     mov         rbp, rsp
   291     SHADOW_ARGS_TO_STACK 6
   292     SAVE_XMM 7
   293     GET_GOT     rbx
   294     push        rsi
   295     push        rdi
   296     ; end prolog
   298     movsxd      rdx, DWORD PTR arg(5)   ;table index
   299     xor         rsi, rsi                ;esi = 0, compared with the filter row below
   300     shl         rdx, 4      ;16 bytes per table row
   302     lea         rax, [GLOBAL(k0_k5)]
   303     add         rax, rdx
   304     movdqa      xmm7, [GLOBAL(rd)]      ;rounding constant, used by both paths
   306     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 row == 0 ?
   307     je          .vp8_filter_block1d4_h4_ssse3
   309     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
   310     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   311     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   313     mov         rsi, arg(0)             ;src_ptr
   314     mov         rdi, arg(2)             ;output_ptr
   315     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
   316     movsxd      rcx, dword ptr arg(4)   ;output_height
   318     movsxd      rdx, dword ptr arg(3)   ;output_pitch
   320 ;xmm3 free
   321 .filter_block1d4_h6_rowloop_ssse3:
   322     movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ;16 source bytes starting at src - 2
   324     movdqa      xmm1, xmm0
   325     pshufb      xmm0, [GLOBAL(shuf1b)]
   327     movdqa      xmm2, xmm1
   328     pshufb      xmm1, [GLOBAL(shuf2b)]
   329     pmaddubsw   xmm0, xmm4
   330     pshufb      xmm2, [GLOBAL(shuf3b)]
   331     pmaddubsw   xmm1, xmm5
   333 ;--
   334     pmaddubsw   xmm2, xmm6
   336     lea         rsi,    [rsi + rax]             ;next source row
   337 ;--
   338     paddsw      xmm0, xmm1
   339     paddsw      xmm0, xmm7                      ;+ rounding
   341     paddsw      xmm0, xmm2
   342     psraw       xmm0, 7
   343     packuswb    xmm0, xmm0
   345     movd        DWORD PTR [rdi], xmm0           ;store 4 pixels
   347     add         rdi, rdx                        ;next output row
   348     dec         rcx
   349     jnz         .filter_block1d4_h6_rowloop_ssse3
   351     ; begin epilog
   352     pop rdi
   353     pop rsi
   354     RESTORE_GOT
   355     RESTORE_XMM
   356     UNSHADOW_ARGS
   357     pop         rbp
   358     ret
       ; 4-tap path: entered with rax = selected k0_k5 row and xmm7 = rd.
   360 .vp8_filter_block1d4_h4_ssse3:
   361     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   362     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   363     movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
   364     movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
   366     mov         rsi, arg(0)             ;src_ptr
   367     mov         rdi, arg(2)             ;output_ptr
   368     movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
   369     movsxd      rcx, dword ptr arg(4)   ;output_height
   371     movsxd      rdx, dword ptr arg(3)   ;output_pitch
   373 .filter_block1d4_h4_rowloop_ssse3:
   374     movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ;16 source bytes starting at src - 2
   376     movdqa      xmm2, xmm1
   377     pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
   378     pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
   379     pmaddubsw   xmm1, xmm5
   381 ;--
   382     pmaddubsw   xmm2, xmm6
   384     lea         rsi,    [rsi + rax]             ;next source row
   385 ;--
   386     paddsw      xmm1, xmm7                      ;+ rounding
   387     paddsw      xmm1, xmm2
   388     psraw       xmm1, 7
   389     packuswb    xmm1, xmm1
   391     movd        DWORD PTR [rdi], xmm1           ;store 4 pixels
   393     add         rdi, rdx                        ;next output row
   394     dec         rcx
   395     jnz         .filter_block1d4_h4_rowloop_ssse3
   397     ; begin epilog
   398     pop rdi
   399     pop rsi
   400     RESTORE_GOT
   401     RESTORE_XMM
   402     UNSHADOW_ARGS
   403     pop         rbp
   404     ret
   408 ;void vp8_filter_block1d16_v6_ssse3
   409 ;(
   410 ;    unsigned char *src_ptr,
   411 ;    unsigned int   src_pitch,
   412 ;    unsigned char *output_ptr,
   413 ;    unsigned int   out_pitch,
   414 ;    unsigned int   output_height,
   415 ;    unsigned int   vp8_filter_index
   416 ;)
       ;
       ; Vertical sub-pixel filter, 16 output pixels per row.
       ;
       ; vp8_filter_index selects a 16-byte row of the k0_k5 table (index << 4);
       ; the k1_k3 and k2_k4 rows are read 128 and 256 bytes past that row.
       ; When the first dword of the k0_k5 row is zero the routine branches to
       ; .vp8_filter_block1d16_v4_ssse3, which skips rows A and F (4-tap).
       ;
       ; For each output row the six source rows A..F are src_ptr + 0..5 *
       ; src_pitch. rax is kept at rsi + src_pitch so rows D and F can be
       ; addressed as [rax + rdx*2] and [rax + rdx*4]. Row pairs (A,F), (B,D)
       ; and (C,E) are byte-interleaved and multiplied with pmaddubsw; the sums
       ; are rounded with rd, shifted right by 7 and packed to unsigned bytes.
       ;
       ; The 16 columns are handled as two 8-column halves. The 6-tap path
       ; writes each half with movq; the 4-tap path joins them and writes with
       ; movdqa, so on that path every output row address must be 16-byte
       ; aligned.
       ;
       ; out_pitch lives in r8 on 64-bit builds and is re-read from the
       ; argument slot on 32-bit builds. Both row loops are "dec rcx / jnz"
       ; do-while loops: output_height must be non-zero.
   417 global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
   418 sym(vp8_filter_block1d16_v6_ssse3):
   419     push        rbp
   420     mov         rbp, rsp
   421     SHADOW_ARGS_TO_STACK 6
   422     SAVE_XMM 7
   423     GET_GOT     rbx
   424     push        rsi
   425     push        rdi
   426     ; end prolog
   428     movsxd      rdx, DWORD PTR arg(5)   ;table index
   429     xor         rsi, rsi                ;esi = 0, compared with the filter row below
   430     shl         rdx, 4      ;16 bytes per table row
   432     lea         rax, [GLOBAL(k0_k5)]
   433     add         rax, rdx
   435     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 row == 0 ?
   436     je          .vp8_filter_block1d16_v4_ssse3
   438     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
   439     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   440     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   442     mov         rsi, arg(0)             ;src_ptr
   443     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   444     mov         rdi, arg(2)             ;output_ptr
   446 %if ABI_IS_32BIT=0
   447     movsxd      r8, DWORD PTR arg(3)    ;out_pitch
   448 %endif
   449     mov         rax, rsi
   450     movsxd      rcx, DWORD PTR arg(4)   ;output_height
   451     add         rax, rdx                ;rax = rsi + pitch (base for rows D and F)
   454 .vp8_filter_block1d16_v6_ssse3_loop:
       ; columns 0..7
   455     movq        xmm1, MMWORD PTR [rsi]                  ;A
   456     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   457     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   458     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   459     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   461     punpcklbw   xmm2, xmm4                  ;B D
   462     punpcklbw   xmm3, xmm0                  ;C E
   464     movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
   466     pmaddubsw   xmm3, xmm6
   467     punpcklbw   xmm1, xmm0                  ;A F
   468     pmaddubsw   xmm2, xmm7
   469     pmaddubsw   xmm1, xmm5
   471     paddsw      xmm2, xmm3
   472     paddsw      xmm2, xmm1
   473     paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
   474     psraw       xmm2, 7
   475     packuswb    xmm2, xmm2
   477     movq        MMWORD PTR [rdi], xmm2          ;store the results
       ; columns 8..15
   479     movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
   480     movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
   481     movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
   482     movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
   483     movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
   485     punpcklbw   xmm2, xmm4                  ;B D
   486     punpcklbw   xmm3, xmm0                  ;C E
   488     movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
   489     pmaddubsw   xmm3, xmm6
   490     punpcklbw   xmm1, xmm0                  ;A F
   491     pmaddubsw   xmm2, xmm7
   492     pmaddubsw   xmm1, xmm5
   494     add         rsi,  rdx                   ;next source row
   495     add         rax,  rdx
   496 ;--
   497 ;--
   498     paddsw      xmm2, xmm3
   499     paddsw      xmm2, xmm1
   500     paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding
   501     psraw       xmm2, 7
   502     packuswb    xmm2, xmm2
   504     movq        MMWORD PTR [rdi+8], xmm2    ;store columns 8..15
   506 %if ABI_IS_32BIT
   507     add         rdi,        DWORD PTR arg(3) ;out_pitch
   508 %else
   509     add         rdi,        r8
   510 %endif
   511     dec         rcx
   512     jnz         .vp8_filter_block1d16_v6_ssse3_loop
   514     ; begin epilog
   515     pop rdi
   516     pop rsi
   517     RESTORE_GOT
   518     RESTORE_XMM
   519     UNSHADOW_ARGS
   520     pop         rbp
   521     ret
       ; 4-tap path: entered with rax = selected k0_k5 row; only rows B..E are read.
   523 .vp8_filter_block1d16_v4_ssse3:
   524     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   525     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   527     mov         rsi, arg(0)             ;src_ptr
   528     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   529     mov         rdi, arg(2)             ;output_ptr
   531 %if ABI_IS_32BIT=0
   532     movsxd      r8, DWORD PTR arg(3)    ;out_pitch
   533 %endif
   534     mov         rax, rsi
   535     movsxd      rcx, DWORD PTR arg(4)   ;output_height
   536     add         rax, rdx                ;rax = rsi + pitch (base for row D)
   538 .vp8_filter_block1d16_v4_ssse3_loop:
   539     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   540     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   541     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   542     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   544     punpcklbw   xmm2, xmm4                  ;B D
   545     punpcklbw   xmm3, xmm0                  ;C E
   547     pmaddubsw   xmm3, xmm6
   548     pmaddubsw   xmm2, xmm7
   549     movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
   550     movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
   551     movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
   552     movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
   554     paddsw      xmm2, [GLOBAL(rd)]          ;+ rounding (columns 0..7)
   555     paddsw      xmm2, xmm3
   556     psraw       xmm2, 7
   557     packuswb    xmm2, xmm2
   559     punpcklbw   xmm5, xmm4                  ;B D
   560     punpcklbw   xmm1, xmm0                  ;C E
   562     pmaddubsw   xmm1, xmm6
   563     pmaddubsw   xmm5, xmm7
   565     movdqa      xmm4, [GLOBAL(rd)]          ;xmm4 is free again: reuse it for rounding
   566     add         rsi,  rdx                   ;next source row
   567     add         rax,  rdx
   568 ;--
   569 ;--
   570     paddsw      xmm5, xmm1
   571     paddsw      xmm5, xmm4                  ;+ rounding (columns 8..15)
   572     psraw       xmm5, 7
   573     packuswb    xmm5, xmm5
   575     punpcklqdq  xmm2, xmm5                  ;columns 0..7 | columns 8..15
   577     movdqa       XMMWORD PTR [rdi], xmm2    ;aligned 16-byte store
   579 %if ABI_IS_32BIT
   580     add         rdi,        DWORD PTR arg(3) ;out_pitch
   581 %else
   582     add         rdi,        r8
   583 %endif
   584     dec         rcx
   585     jnz         .vp8_filter_block1d16_v4_ssse3_loop
   587     ; begin epilog
   588     pop rdi
   589     pop rsi
   590     RESTORE_GOT
   591     RESTORE_XMM
   592     UNSHADOW_ARGS
   593     pop         rbp
   594     ret
   596 ;void vp8_filter_block1d8_v6_ssse3
   597 ;(
   598 ;    unsigned char *src_ptr,
   599 ;    unsigned int   src_pitch,
   600 ;    unsigned char *output_ptr,
   601 ;    unsigned int   out_pitch,
   602 ;    unsigned int   output_height,
   603 ;    unsigned int   vp8_filter_index
   604 ;)
       ;
       ; Vertical sub-pixel filter, 8 output pixels per row.
       ;
       ; vp8_filter_index selects a 16-byte row of the k0_k5 table (index << 4);
       ; the k1_k3 and k2_k4 rows are read 128 and 256 bytes past that row.
       ; When the first dword of the k0_k5 row is zero the routine branches to
       ; .vp8_filter_block1d8_v4_ssse3, which skips rows A and F (4-tap).
       ;
       ; For each output row the six source rows A..F are src_ptr + 0..5 *
       ; src_pitch. rax is kept at rsi + src_pitch so rows D and F can be
       ; addressed as [rax + rdx*2] and [rax + rdx*4]. Row pairs are
       ; byte-interleaved, multiplied with pmaddubsw, summed with saturation,
       ; rounded with rd, shifted right by 7, packed to unsigned bytes and
       ; 8 bytes are stored with movq.
       ;
       ; rdx (pitch), rdi, rcx and r8 are set up before the tap test and are
       ; shared by both paths. Both row loops are "dec rcx / jnz" do-while
       ; loops: output_height must be non-zero.
   605 global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
   606 sym(vp8_filter_block1d8_v6_ssse3):
   607     push        rbp
   608     mov         rbp, rsp
   609     SHADOW_ARGS_TO_STACK 6
   610     SAVE_XMM 7
   611     GET_GOT     rbx
   612     push        rsi
   613     push        rdi
   614     ; end prolog
   616     movsxd      rdx, DWORD PTR arg(5)   ;table index
   617     xor         rsi, rsi                ;esi = 0, compared with the filter row below
   618     shl         rdx, 4      ;16 bytes per table row
   620     lea         rax, [GLOBAL(k0_k5)]
   621     add         rax, rdx
   623     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   624     mov         rdi, arg(2)             ;output_ptr
   625 %if ABI_IS_32BIT=0
   626     movsxd      r8, DWORD PTR arg(3)    ; out_pitch
   627 %endif
   628     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
   630     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 row == 0 ?
   631     je          .vp8_filter_block1d8_v4_ssse3
   633     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
   634     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   635     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   637     mov         rsi, arg(0)             ;src_ptr
   639     mov         rax, rsi
   640     add         rax, rdx                ;rax = rsi + pitch (base for rows D and F)
   642 .vp8_filter_block1d8_v6_ssse3_loop:
   643     movq        xmm1, MMWORD PTR [rsi]                  ;A
   644     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   645     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   646     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   647     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   649     punpcklbw   xmm2, xmm4                  ;B D
   650     punpcklbw   xmm3, xmm0                  ;C E
   652     movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
   653     movdqa      xmm4, [GLOBAL(rd)]          ;xmm4 (row D scratch) is free again: reload rounding
   655     pmaddubsw   xmm3, xmm6
   656     punpcklbw   xmm1, xmm0                  ;A F
   657     pmaddubsw   xmm2, xmm7
   658     pmaddubsw   xmm1, xmm5
   659     add         rsi,  rdx                   ;next source row
   660     add         rax,  rdx
   661 ;--
   662 ;--
   663     paddsw      xmm2, xmm3
   664     paddsw      xmm2, xmm1
   665     paddsw      xmm2, xmm4                  ;+ rounding
   666     psraw       xmm2, 7
   667     packuswb    xmm2, xmm2
   669     movq        MMWORD PTR [rdi], xmm2      ;store 8 pixels
   671 %if ABI_IS_32BIT
   672     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   673 %else
   674     add         rdi,        r8
   675 %endif
   676     dec         rcx
   677     jnz         .vp8_filter_block1d8_v6_ssse3_loop
   679     ; begin epilog
   680     pop rdi
   681     pop rsi
   682     RESTORE_GOT
   683     RESTORE_XMM
   684     UNSHADOW_ARGS
   685     pop         rbp
   686     ret
       ; 4-tap path: entered with rax = selected k0_k5 row and rdx/rdi/rcx/r8
       ; already loaded; only rows B..E are read. xmm5 is unused by the taps
       ; here, so rd stays resident in it for the whole loop.
   688 .vp8_filter_block1d8_v4_ssse3:
   689     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   690     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   691     movdqa      xmm5, [GLOBAL(rd)]
   693     mov         rsi, arg(0)             ;src_ptr
   695     mov         rax, rsi
   696     add         rax, rdx                ;rax = rsi + pitch (base for row D)
   698 .vp8_filter_block1d8_v4_ssse3_loop:
   699     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   700     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   701     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   702     movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   704     punpcklbw   xmm2, xmm4                  ;B D
   705     punpcklbw   xmm3, xmm0                  ;C E
   707     pmaddubsw   xmm3, xmm6
   708     pmaddubsw   xmm2, xmm7
   709     add         rsi,  rdx                   ;next source row
   710     add         rax,  rdx
   711 ;--
   712 ;--
   713     paddsw      xmm2, xmm3
   714     paddsw      xmm2, xmm5                  ;+ rounding
   715     psraw       xmm2, 7
   716     packuswb    xmm2, xmm2
   718     movq        MMWORD PTR [rdi], xmm2      ;store 8 pixels
   720 %if ABI_IS_32BIT
   721     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   722 %else
   723     add         rdi,        r8
   724 %endif
   725     dec         rcx
   726     jnz         .vp8_filter_block1d8_v4_ssse3_loop
   728     ; begin epilog
   729     pop rdi
   730     pop rsi
   731     RESTORE_GOT
   732     RESTORE_XMM
   733     UNSHADOW_ARGS
   734     pop         rbp
   735     ret
   736 ;void vp8_filter_block1d4_v6_ssse3
   737 ;(
   738 ;    unsigned char *src_ptr,
   739 ;    unsigned int   src_pitch,
   740 ;    unsigned char *output_ptr,
   741 ;    unsigned int   out_pitch,
   742 ;    unsigned int   output_height,
   743 ;    unsigned int   vp8_filter_index
   744 ;)
       ;
       ; Vertical sub-pixel filter, 4 output pixels per row, using the 64-bit
       ; MMX registers (mm0-mm7) with the SSSE3 pmaddubsw instruction. No xmm
       ; register is touched, which is why the prolog has no SAVE_XMM.
       ;
       ; vp8_filter_index selects a 16-byte row of the k0_k5 table (index << 4);
       ; only the low 8 bytes of each coefficient row are loaded. The k1_k3 and
       ; k2_k4 rows are read 128 and 256 bytes past the k0_k5 row.
       ; When the first dword of the k0_k5 row is zero the routine branches to
       ; .vp8_filter_block1d4_v4_ssse3, which skips rows A and F (4-tap).
       ;
       ; For each output row the six source rows A..F are src_ptr + 0..5 *
       ; src_pitch (4 bytes each). rax is kept at rsi + src_pitch so rows D and
       ; F can be addressed as [rax + rdx*2] and [rax + rdx*4]. The sums are
       ; rounded with rd, shifted right by 7, packed to unsigned bytes and
       ; 4 bytes are stored with movd.
       ;
       ; Both row loops are "dec rcx / jnz" do-while loops: output_height must
       ; be non-zero.
       ; NOTE(review): no emms is executed before ret on either path; the MMX
       ; registers alias the x87 register file, so presumably callers clear
       ; the MMX state before any x87 floating-point code runs - confirm.
   745 global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
   746 sym(vp8_filter_block1d4_v6_ssse3):
   747     push        rbp
   748     mov         rbp, rsp
   749     SHADOW_ARGS_TO_STACK 6
   750     GET_GOT     rbx
   751     push        rsi
   752     push        rdi
   753     ; end prolog
   755     movsxd      rdx, DWORD PTR arg(5)   ;table index
   756     xor         rsi, rsi                ;esi = 0, compared with the filter row below
   757     shl         rdx, 4      ;16 bytes per table row
   759     lea         rax, [GLOBAL(k0_k5)]
   760     add         rax, rdx
   762     movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   763     mov         rdi, arg(2)             ;output_ptr
   764 %if ABI_IS_32BIT=0
   765     movsxd      r8, DWORD PTR arg(3)    ; out_pitch
   766 %endif
   767     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
   769     cmp         esi, DWORD PTR [rax]    ;first dword of the k0_k5 row == 0 ?
   770     je          .vp8_filter_block1d4_v4_ssse3
   772     movq        mm5, MMWORD PTR [rax]         ;k0_k5
   773     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
   774     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
   776     mov         rsi, arg(0)             ;src_ptr
   778     mov         rax, rsi
   779     add         rax, rdx                ;rax = rsi + pitch (base for rows D and F)
   781 .vp8_filter_block1d4_v6_ssse3_loop:
   782     movd        mm1, DWORD PTR [rsi]                  ;A
   783     movd        mm2, DWORD PTR [rsi + rdx]            ;B
   784     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
   785     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
   786     movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
   788     punpcklbw   mm2, mm4                  ;B D
   789     punpcklbw   mm3, mm0                  ;C E
   791     movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
   793     movq        mm4, [GLOBAL(rd)]         ;mm4 (row D scratch) is free again: reload rounding
   795     pmaddubsw   mm3, mm6
   796     punpcklbw   mm1, mm0                  ;A F
   797     pmaddubsw   mm2, mm7
   798     pmaddubsw   mm1, mm5
   799     add         rsi,  rdx                 ;next source row
   800     add         rax,  rdx
   801 ;--
   802 ;--
   803     paddsw      mm2, mm3
   804     paddsw      mm2, mm1
   805     paddsw      mm2, mm4                  ;+ rounding
   806     psraw       mm2, 7
   807     packuswb    mm2, mm2
   809     movd        DWORD PTR [rdi], mm2      ;store 4 pixels
   811 %if ABI_IS_32BIT
   812     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   813 %else
   814     add         rdi,        r8
   815 %endif
   816     dec         rcx
   817     jnz         .vp8_filter_block1d4_v6_ssse3_loop
   819     ; begin epilog
   820     pop rdi
   821     pop rsi
   822     RESTORE_GOT
   823     UNSHADOW_ARGS
   824     pop         rbp
   825     ret
       ; 4-tap path: entered with rax = selected k0_k5 row and rdx/rdi/rcx/r8
       ; already loaded; only rows B..E are read. mm5 is unused by the taps
       ; here, so rd stays resident in it for the whole loop.
   827 .vp8_filter_block1d4_v4_ssse3:
   828     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
   829     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
   830     movq        mm5, MMWORD PTR [GLOBAL(rd)]
   832     mov         rsi, arg(0)             ;src_ptr
   834     mov         rax, rsi
   835     add         rax, rdx                ;rax = rsi + pitch (base for row D)
   837 .vp8_filter_block1d4_v4_ssse3_loop:
   838     movd        mm2, DWORD PTR [rsi + rdx]            ;B
   839     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
   840     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
   841     movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
   843     punpcklbw   mm2, mm4                  ;B D
   844     punpcklbw   mm3, mm0                  ;C E
   846     pmaddubsw   mm3, mm6
   847     pmaddubsw   mm2, mm7
   848     add         rsi,  rdx                 ;next source row
   849     add         rax,  rdx
   850 ;--
   851 ;--
   852     paddsw      mm2, mm3
   853     paddsw      mm2, mm5                  ;+ rounding
   854     psraw       mm2, 7
   855     packuswb    mm2, mm2
   857     movd        DWORD PTR [rdi], mm2      ;store 4 pixels
   859 %if ABI_IS_32BIT
   860     add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   861 %else
   862     add         rdi,        r8
   863 %endif
   864     dec         rcx
   865     jnz         .vp8_filter_block1d4_v4_ssse3_loop
   867     ; begin epilog
   868     pop rdi
   869     pop rsi
   870     RESTORE_GOT
   871     UNSHADOW_ARGS
   872     pop         rbp
   873     ret
   875 ;void vp8_bilinear_predict16x16_ssse3
   876 ;(
   877 ;    unsigned char  *src_ptr,
   878 ;    int   src_pixels_per_line,
   879 ;    int  xoffset,
   880 ;    int  yoffset,
   881 ;    unsigned char *dst_ptr,
   882 ;    int dst_pitch
   883 ;)
   884 global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
   885 sym(vp8_bilinear_predict16x16_ssse3):
   886     push        rbp
   887     mov         rbp, rsp
   888     SHADOW_ARGS_TO_STACK 6
   889     SAVE_XMM 7
   890     GET_GOT     rbx
   891     push        rsi
   892     push        rdi
   893     ; end prolog
   895         lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
   896         movsxd      rax,        dword ptr arg(2)    ; xoffset
   898         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
   899         je          .b16x16_sp_only
   901         shl         rax,        4
   902         lea         rax,        [rax + rcx]         ; HFilter
   904         mov         rdi,        arg(4)              ; dst_ptr
   905         mov         rsi,        arg(0)              ; src_ptr
   906         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   908         movdqa      xmm1,       [rax]
   910         movsxd      rax,        dword ptr arg(3)    ; yoffset
   912         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
   913         je          .b16x16_fp_only
   915         shl         rax,        4
   916         lea         rax,        [rax + rcx]         ; VFilter
   918         lea         rcx,        [rdi+rdx*8]
   919         lea         rcx,        [rcx+rdx*8]
   920         movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line
   922         movdqa      xmm2,       [rax]
   924 %if ABI_IS_32BIT=0
   925         movsxd      r8,         dword ptr arg(5)    ; dst_pitch
   926 %endif
   927         movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
   928         movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   930         punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
   931         movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   933         movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   935         lea         rsi,        [rsi + rdx]         ; next line
   937         pmaddubsw   xmm3,       xmm1                ; 00 02 04 06 08 10 12 14
   939         punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
   940         pmaddubsw   xmm4,       xmm1                ; 01 03 05 07 09 11 13 15
   942         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   943         psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
   945         paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
   946         psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
   948         movdqa      xmm7,       xmm3
   949         packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   951 .next_row:
   952         movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
   953         movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   955         punpcklbw   xmm6,       xmm5
   956         movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   958         movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   959         lea         rsi,        [rsi + rdx]         ; next line
   961         pmaddubsw   xmm6,       xmm1
   963         punpcklbw   xmm4,       xmm5
   964         pmaddubsw   xmm4,       xmm1
   966         paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
   967         psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
   969         paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
   970         psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
   972         packuswb    xmm6,       xmm4
   973         movdqa      xmm5,       xmm7
   975         punpcklbw   xmm5,       xmm6
   976         pmaddubsw   xmm5,       xmm2
   978         punpckhbw   xmm7,       xmm6
   979         pmaddubsw   xmm7,       xmm2
   981         paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
   982         psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
   984         paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
   985         psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
   987         packuswb    xmm5,       xmm7
   988         movdqa      xmm7,       xmm6
   990         movdqa      [rdi],      xmm5                ; store the results in the destination
   991 %if ABI_IS_32BIT
   992         add         rdi,        DWORD PTR arg(5)    ; dst_pitch
   993 %else
   994         add         rdi,        r8
   995 %endif
   997         cmp         rdi,        rcx
   998         jne         .next_row
  1000         jmp         .done
  1002 .b16x16_sp_only:
  1003         movsxd      rax,        dword ptr arg(3)    ; yoffset
  1004         shl         rax,        4
  1005         lea         rax,        [rax + rcx]         ; VFilter
  1007         mov         rdi,        arg(4)              ; dst_ptr
  1008         mov         rsi,        arg(0)              ; src_ptr
  1009         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
  1011         movdqa      xmm1,       [rax]               ; VFilter
  1013         lea         rcx,        [rdi+rdx*8]
  1014         lea         rcx,        [rcx+rdx*8]
  1015         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
  1017         ; get the first horizontal line done
  1018         movq        xmm4,       [rsi]               ; load row 0
  1019         movq        xmm2,       [rsi + 8]           ; load row 0
  1021         lea         rsi,        [rsi + rax]         ; next line
  1022 .next_row_sp:
  1023         movq        xmm3,       [rsi]               ; load row + 1
  1024         movq        xmm5,       [rsi + 8]           ; load row + 1
  1026         punpcklbw   xmm4,       xmm3
  1027         punpcklbw   xmm2,       xmm5
  1029         pmaddubsw   xmm4,       xmm1
  1030         movq        xmm7,       [rsi + rax]         ; load row + 2
  1032         pmaddubsw   xmm2,       xmm1
  1033         movq        xmm6,       [rsi + rax + 8]     ; load row + 2
  1035         punpcklbw   xmm3,       xmm7
  1036         punpcklbw   xmm5,       xmm6
  1038         pmaddubsw   xmm3,       xmm1
  1039         paddw       xmm4,       [GLOBAL(rd)]
  1041         pmaddubsw   xmm5,       xmm1
  1042         paddw       xmm2,       [GLOBAL(rd)]
  1044         psraw       xmm4,       VP8_FILTER_SHIFT
  1045         psraw       xmm2,       VP8_FILTER_SHIFT
  1047         packuswb    xmm4,       xmm2
  1048         paddw       xmm3,       [GLOBAL(rd)]
  1050         movdqa      [rdi],      xmm4                ; store row 0
  1051         paddw       xmm5,       [GLOBAL(rd)]
  1053         psraw       xmm3,       VP8_FILTER_SHIFT
  1054         psraw       xmm5,       VP8_FILTER_SHIFT
  1056         packuswb    xmm3,       xmm5
  1057         movdqa      xmm4,       xmm7
  1059         movdqa      [rdi + rdx],xmm3                ; store row 1
  1060         lea         rsi,        [rsi + 2*rax]
  1062         movdqa      xmm2,       xmm6
  1063         lea         rdi,        [rdi + 2*rdx]
  1065         cmp         rdi,        rcx
  1066         jne         .next_row_sp
  1068         jmp         .done
  1070 .b16x16_fp_only:
  1071         lea         rcx,        [rdi+rdx*8]
  1072         lea         rcx,        [rcx+rdx*8]
  1073         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
  1075 .next_row_fp:
  1076         movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
  1077         movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
  1079         punpcklbw   xmm2,       xmm4
  1080         movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
  1082         pmaddubsw   xmm2,       xmm1
  1083         movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
  1085         lea         rsi,        [rsi + rax]         ; next line
  1086         punpcklbw   xmm3,       xmm4
  1088         pmaddubsw   xmm3,       xmm1
  1089         movq        xmm5,       [rsi]
  1091         paddw       xmm2,       [GLOBAL(rd)]
  1092         movq        xmm7,       [rsi+1]
  1094         movq        xmm6,       [rsi+8]
  1095         psraw       xmm2,       VP8_FILTER_SHIFT
  1097         punpcklbw   xmm5,       xmm7
  1098         movq        xmm7,       [rsi+9]
  1100         paddw       xmm3,       [GLOBAL(rd)]
  1101         pmaddubsw   xmm5,       xmm1
  1103         psraw       xmm3,       VP8_FILTER_SHIFT
  1104         punpcklbw   xmm6,       xmm7
  1106         packuswb    xmm2,       xmm3
  1107         pmaddubsw   xmm6,       xmm1
  1109         movdqa      [rdi],      xmm2                ; store the results in the destination
  1110         paddw       xmm5,       [GLOBAL(rd)]
  1112         lea         rdi,        [rdi + rdx]         ; dst_pitch
  1113         psraw       xmm5,       VP8_FILTER_SHIFT
  1115         paddw       xmm6,       [GLOBAL(rd)]
  1116         psraw       xmm6,       VP8_FILTER_SHIFT
  1118         packuswb    xmm5,       xmm6
  1119         lea         rsi,        [rsi + rax]         ; next line
  1121         movdqa      [rdi],      xmm5                ; store the results in the destination
  1122         lea         rdi,        [rdi + rdx]         ; dst_pitch
  1124         cmp         rdi,        rcx
  1126         jne         .next_row_fp
  1128 .done:
  1129     ; begin epilog
  1130     pop         rdi
  1131     pop         rsi
  1132     RESTORE_GOT
  1133     RESTORE_XMM
  1134     UNSHADOW_ARGS
  1135     pop         rbp
  1136     ret
  1138 ;void vp8_bilinear_predict8x8_ssse3
  1139 ;(
  1140 ;    unsigned char  *src_ptr,
  1141 ;    int   src_pixels_per_line,
  1142 ;    int  xoffset,
  1143 ;    int  yoffset,
  1144 ;    unsigned char *dst_ptr,
  1145 ;    int dst_pitch
  1146 ;)
       ;
       ; Bilinear prediction of an 8x8 block, written to dst_ptr with stride
       ; dst_pitch.
       ;
       ; Flow:
       ;   1. Nine source rows (16 bytes each, stride src_pixels_per_line) are
       ;      copied with unaligned loads into a 16-byte aligned, 144-byte
       ;      scratch area on the stack.
       ;   2. xoffset and yoffset each select one 16-byte row of
       ;      vp8_bilinear_filters_ssse3 (offset << 4).
       ;   3. xoffset == 0 -> .b8x8_sp_only (vertical filter only)
       ;      yoffset == 0 -> .b8x8_fp_only (horizontal filter only)
       ;      otherwise    -> horizontal then vertical filter, one output row
       ;                      per iteration of .next_row.
       ;   Every filter step is pmaddubsw + rd (rounding) + arithmetic shift
       ;   right by VP8_FILTER_SHIFT, then packuswb back to bytes.
       ;
       ; Stack invariant: rsp itself is used as the read cursor over the
       ; scratch rows. Every path advances rsp by exactly 144 bytes in total
       ; before reaching .done8x8, where "pop rsp" restores the caller's stack
       ; pointer (presumably the value ALIGN_STACK saved just above the scratch
       ; area; the macro lives in x86_abi_support.asm - confirm).
       ;
       ; The macros SHADOW_ARGS_TO_STACK / SAVE_XMM / GET_GOT and the arg(n)
       ; accessor come from x86_abi_support.asm and are not visible here.
  1147 global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
  1148 sym(vp8_bilinear_predict8x8_ssse3):
  1149     push        rbp
  1150     mov         rbp, rsp
  1151     SHADOW_ARGS_TO_STACK 6
  1152     SAVE_XMM 7
  1153     GET_GOT     rbx
  1154     push        rsi
  1155     push        rdi
  1156     ; end prolog
  1158     ALIGN_STACK 16, rax
  1159     sub         rsp, 144                         ; reserve 9 rows * 16 bytes of scratch
  1161         lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)] ; rcx = filter table base
  1163         mov         rsi,        arg(0) ;src_ptr
  1164         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
  1166     ;Read 9 lines of unaligned data and store them on the stack. This gives a big
  1167     ;performance boost. 16 bytes are loaded per row; the filters below only use
       ;the first 9 bytes (8 in the vertical-only path) of each row.
  1168         movdqu      xmm0,       [rsi]               ; row 0
  1169         lea         rax,        [rdx + rdx*2]       ; rax = 3 * src stride
  1170         movdqu      xmm1,       [rsi+rdx]           ; row 1
  1171         movdqu      xmm2,       [rsi+rdx*2]         ; row 2
  1172         add         rsi,        rax                 ; rsi -> row 3
  1173         movdqu      xmm3,       [rsi]               ; row 3
  1174         movdqu      xmm4,       [rsi+rdx]           ; row 4
  1175         movdqu      xmm5,       [rsi+rdx*2]         ; row 5
  1176         add         rsi,        rax                 ; rsi -> row 6
  1177         movdqu      xmm6,       [rsi]               ; row 6
  1178         movdqu      xmm7,       [rsi+rdx]           ; row 7
  1180         movdqa      XMMWORD PTR [rsp],            xmm0 ; spill row 0 so xmm0 can take row 8
  1182         movdqu      xmm0,       [rsi+rdx*2]         ; row 8
  1184         movdqa      XMMWORD PTR [rsp+16],         xmm1
  1185         movdqa      XMMWORD PTR [rsp+32],         xmm2
  1186         movdqa      XMMWORD PTR [rsp+48],         xmm3
  1187         movdqa      XMMWORD PTR [rsp+64],         xmm4
  1188         movdqa      XMMWORD PTR [rsp+80],         xmm5
  1189         movdqa      XMMWORD PTR [rsp+96],         xmm6
  1190         movdqa      XMMWORD PTR [rsp+112],        xmm7
  1191         movdqa      XMMWORD PTR [rsp+128],        xmm0 ; row 8
  1193         movsxd      rax,        dword ptr arg(2)    ; xoffset
  1194         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
  1195         je          .b8x8_sp_only
  1197         shl         rax,        4                   ; 16 bytes per filter table row
  1198         add         rax,        rcx                 ; HFilter
  1200         mov         rdi,        arg(4)              ; dst_ptr
  1201         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
  1203         movdqa      xmm0,       [rax]               ; xmm0 = HFilter byte pairs
  1205         movsxd      rax,        dword ptr arg(3)    ; yoffset
  1206         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
  1207         je          .b8x8_fp_only
  1209         shl         rax,        4                   ; 16 bytes per filter table row
  1210         lea         rax,        [rax + rcx]         ; VFilter
  1212         lea         rcx,        [rdi+rdx*8]         ; rcx = dst end (8 rows), loop sentinel
  1214         movdqa      xmm1,       [rax]               ; xmm1 = VFilter byte pairs
  1216         ; get the first horizontal line done
  1217         movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  1218         movdqa      xmm5,       xmm3                ; copy of the row, shifted by one pixel below
  1220         psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
  1221         lea         rsp,        [rsp + 16]          ; next line (rsp is the scratch-row cursor)
  1223         punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
  1224         pmaddubsw   xmm3,       xmm0                ; 00 02 04 06 08 10 12 14
  1226         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1227         psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
  1229         movdqa      xmm7,       xmm3
  1230         packuswb    xmm7,       xmm7                ; 8 filtered bytes, duplicated in both halves
       ; Loop invariant: xmm7 holds the horizontally filtered previous row
       ; as bytes; each pass filters the next row and blends the two vertically.
  1232 .next_row:
  1233         movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  1234         lea         rsp,        [rsp + 16]          ; next line
  1236         movdqa      xmm5,       xmm6
  1238         psrldq      xmm5,       1                   ; same row shifted by one pixel
  1240         punpcklbw   xmm6,       xmm5                ; interleave pixel n with pixel n+1
  1241         pmaddubsw   xmm6,       xmm0                ; horizontal filter
  1243         paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
  1244         psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
  1246         packuswb    xmm6,       xmm6                ; current row as bytes
  1248         punpcklbw   xmm7,       xmm6                ; interleave previous row with current row
  1249         pmaddubsw   xmm7,       xmm1                ; vertical filter
  1251         paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
  1252         psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
  1254         packuswb    xmm7,       xmm7
  1256         movq        [rdi],      xmm7                ; store the results in the destination
  1257         lea         rdi,        [rdi + rdx]         ; next destination row
  1259         movdqa      xmm7,       xmm6                ; current row becomes previous row
  1261         cmp         rdi,        rcx
  1262         jne         .next_row
       ; rsp has advanced 16 + 8*16 = 144 bytes here.
  1264         jmp         .done8x8
       ; Vertical-only path (xoffset == 0): fully unrolled, uses the low
       ; 8 bytes of each of the 9 saved rows.
       ; NOTE(review): yoffset is not tested for 0 on this path. Filter row 0
       ; contains the byte 128, which pmaddubsw treats as a signed value in
       ; its source operand - presumably callers never pass
       ; xoffset == yoffset == 0; confirm.
  1266 .b8x8_sp_only:
  1267         movsxd      rax,        dword ptr arg(3)    ; yoffset
  1268         shl         rax,        4                   ; 16 bytes per filter table row
  1269         lea         rax,        [rax + rcx]         ; VFilter
  1271         mov         rdi,        arg(4) ;dst_ptr
  1272         movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
  1274         movdqa      xmm0,       [rax]               ; VFilter
  1276         movq        xmm1,       XMMWORD PTR [rsp]      ; row 0
  1277         movq        xmm2,       XMMWORD PTR [rsp+16]   ; row 1
  1279         movq        xmm3,       XMMWORD PTR [rsp+32]   ; row 2
  1280         punpcklbw   xmm1,       xmm2                ; rows 0/1 interleaved
  1282         movq        xmm4,       XMMWORD PTR [rsp+48]   ; row 3
  1283         punpcklbw   xmm2,       xmm3                ; rows 1/2 interleaved
  1285         movq        xmm5,       XMMWORD PTR [rsp+64]   ; row 4
  1286         punpcklbw   xmm3,       xmm4                ; rows 2/3 interleaved
  1288         movq        xmm6,       XMMWORD PTR [rsp+80]   ; row 5
  1289         punpcklbw   xmm4,       xmm5                ; rows 3/4 interleaved
  1291         movq        xmm7,       XMMWORD PTR [rsp+96]   ; row 6
  1292         punpcklbw   xmm5,       xmm6                ; rows 4/5 interleaved
  1294         pmaddubsw   xmm1,       xmm0
  1295         pmaddubsw   xmm2,       xmm0
  1297         pmaddubsw   xmm3,       xmm0
  1298         pmaddubsw   xmm4,       xmm0
  1300         pmaddubsw   xmm5,       xmm0
  1301         punpcklbw   xmm6,       xmm7                ; rows 5/6 interleaved (xmm7 keeps row 6)
  1303         pmaddubsw   xmm6,       xmm0
  1304         paddw       xmm1,       [GLOBAL(rd)]
  1306         paddw       xmm2,       [GLOBAL(rd)]
  1307         psraw       xmm1,       VP8_FILTER_SHIFT
  1309         paddw       xmm3,       [GLOBAL(rd)]
  1310         psraw       xmm2,       VP8_FILTER_SHIFT
  1312         paddw       xmm4,       [GLOBAL(rd)]
  1313         psraw       xmm3,       VP8_FILTER_SHIFT
  1315         paddw       xmm5,       [GLOBAL(rd)]
  1316         psraw       xmm4,       VP8_FILTER_SHIFT
  1318         paddw       xmm6,       [GLOBAL(rd)]
  1319         psraw       xmm5,       VP8_FILTER_SHIFT
  1321         psraw       xmm6,       VP8_FILTER_SHIFT
  1322         packuswb    xmm1,       xmm1
  1324         packuswb    xmm2,       xmm2
  1325         movq        [rdi],      xmm1                ; output row 0
  1327         packuswb    xmm3,       xmm3
  1328         movq        [rdi+rdx],  xmm2                ; output row 1
  1330         packuswb    xmm4,       xmm4
  1331         movq        xmm1,       XMMWORD PTR [rsp+112]  ; row 7 (xmm1 is free again)
  1333         lea         rdi,        [rdi + 2*rdx]
  1334         movq        xmm2,       XMMWORD PTR [rsp+128]  ; row 8
  1336         packuswb    xmm5,       xmm5
  1337         movq        [rdi],      xmm3                ; output row 2
  1339         packuswb    xmm6,       xmm6
  1340         movq        [rdi+rdx],  xmm4                ; output row 3
  1342         lea         rdi,        [rdi + 2*rdx]
  1343         punpcklbw   xmm7,       xmm1                ; rows 6/7 interleaved
  1345         movq        [rdi],      xmm5                ; output row 4
  1346         pmaddubsw   xmm7,       xmm0
  1348         movq        [rdi+rdx],  xmm6                ; output row 5
  1349         punpcklbw   xmm1,       xmm2                ; rows 7/8 interleaved
  1351         pmaddubsw   xmm1,       xmm0
  1352         paddw       xmm7,       [GLOBAL(rd)]
  1354         psraw       xmm7,       VP8_FILTER_SHIFT
  1355         paddw       xmm1,       [GLOBAL(rd)]
  1357         psraw       xmm1,       VP8_FILTER_SHIFT
  1358         packuswb    xmm7,       xmm7
  1360         packuswb    xmm1,       xmm1
  1361         lea         rdi,        [rdi + 2*rdx]
  1363         movq        [rdi],      xmm7                ; output row 6
  1365         movq        [rdi+rdx],  xmm1                ; output row 7
  1366         lea         rsp,        [rsp + 144]         ; step over the whole scratch area for pop rsp
  1368         jmp         .done8x8
       ; Horizontal-only path (yoffset == 0): xmm0 = HFilter, rdi/rdx were
       ; set before the jump. Four rows per iteration.
  1370 .b8x8_fp_only:
  1371         lea         rcx,        [rdi+rdx*8]         ; rcx = dst end (8 rows), loop sentinel
  1373 .next_row_fp:
  1374         movdqa      xmm1,       XMMWORD PTR [rsp]      ; row n
  1375         movdqa      xmm3,       XMMWORD PTR [rsp+16]   ; row n+1
  1377         movdqa      xmm2,       xmm1
  1378         movdqa      xmm5,       XMMWORD PTR [rsp+32]   ; row n+2
  1380         psrldq      xmm2,       1                   ; row n shifted by one pixel
  1381         movdqa      xmm7,       XMMWORD PTR [rsp+48]   ; row n+3
  1383         movdqa      xmm4,       xmm3
  1384         psrldq      xmm4,       1                   ; row n+1 shifted by one pixel
  1386         movdqa      xmm6,       xmm5
  1387         psrldq      xmm6,       1                   ; row n+2 shifted by one pixel
  1389         punpcklbw   xmm1,       xmm2                ; interleave pixel k with pixel k+1
  1390         pmaddubsw   xmm1,       xmm0
  1392         punpcklbw   xmm3,       xmm4
  1393         pmaddubsw   xmm3,       xmm0
  1395         punpcklbw   xmm5,       xmm6
  1396         pmaddubsw   xmm5,       xmm0
  1398         movdqa      xmm2,       xmm7
  1399         psrldq      xmm2,       1                   ; row n+3 shifted by one pixel
  1401         punpcklbw   xmm7,       xmm2
  1402         pmaddubsw   xmm7,       xmm0
  1404         paddw       xmm1,       [GLOBAL(rd)]
  1405         psraw       xmm1,       VP8_FILTER_SHIFT
  1407         paddw       xmm3,       [GLOBAL(rd)]
  1408         psraw       xmm3,       VP8_FILTER_SHIFT
  1410         paddw       xmm5,       [GLOBAL(rd)]
  1411         psraw       xmm5,       VP8_FILTER_SHIFT
  1413         paddw       xmm7,       [GLOBAL(rd)]
  1414         psraw       xmm7,       VP8_FILTER_SHIFT
  1416         packuswb    xmm1,       xmm1
  1417         packuswb    xmm3,       xmm3
  1419         packuswb    xmm5,       xmm5
  1420         movq        [rdi],      xmm1                ; output row n
  1422         packuswb    xmm7,       xmm7
  1423         movq        [rdi+rdx],  xmm3                ; output row n+1
  1425         lea         rdi,        [rdi + 2*rdx]
  1426         movq        [rdi],      xmm5                ; output row n+2
  1428         lea         rsp,        [rsp + 4*16]        ; advance the scratch cursor by 4 rows
  1429         movq        [rdi+rdx],  xmm7                ; output row n+3
  1431         lea         rdi,        [rdi + 2*rdx]
  1432         cmp         rdi,        rcx
  1434         jne         .next_row_fp
  1436         lea         rsp,        [rsp + 16]          ; skip the unused 9th row: 2*64 + 16 = 144
  1438 .done8x8:
  1439     ;add rsp, 144                                   ; not needed: each path already moved rsp by 144
  1440     pop         rsp                                 ; restore the stack pointer from before ALIGN_STACK
  1441     ; begin epilog
  1442     pop         rdi
  1443     pop         rsi
  1444     RESTORE_GOT
  1445     RESTORE_XMM
  1446     UNSHADOW_ARGS
  1447     pop         rbp
  1448     ret
  1450 SECTION_RODATA
       ; Read-only constant tables. Each table is 16-byte aligned (directly, or
       ; by following a whole number of 16-byte rows), so rows can be read with
       ; aligned 128-bit loads.
  1451 align 16
       ; Byte-index tables, 16 entries each - presumably pshufb masks for the
       ; six-tap filters earlier in the file; nothing in the bilinear
       ; 8x8 routine above references them.
  1452 shuf1b:
  1453     db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
  1454 shuf2b:
  1455     db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
  1456 shuf3b:
  1457     db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
  1459 align 16
  1460 shuf2bfrom1:
  1461     db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
  1462 align 16
  1463 shuf3bfrom1:
  1464     db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
  1466 align 16
       ; Rounding constant: eight words of 0x40 (64), added to each filter sum
       ; before the arithmetic shift right by VP8_FILTER_SHIFT.
  1467 rd:
  1468     times 8 dw 0x40
  1470 align 16
       ; k0_k5 / k1_k3 / k2_k4: eight 16-byte rows each, every row being one
       ; signed byte pair repeated 8 times. Judging by the label names these
       ; are tap pairs (0,5), (1,3) and (2,4) of the six-tap filters, one row
       ; per sub-pixel offset - the code that uses them is outside this
       ; section; confirm there.
  1471 k0_k5:
  1472     times 8 db 0, 0             ;placeholder
  1473     times 8 db 0, 0
  1474     times 8 db 2, 1
  1475     times 8 db 0, 0
  1476     times 8 db 3, 3
  1477     times 8 db 0, 0
  1478     times 8 db 1, 2
  1479     times 8 db 0, 0
  1480 k1_k3:
  1481     times 8 db  0,    0         ;placeholder
  1482     times 8 db  -6,  12
  1483     times 8 db -11,  36
  1484     times 8 db  -9,  50
  1485     times 8 db -16,  77
  1486     times 8 db  -6,  93
  1487     times 8 db  -8, 108
  1488     times 8 db  -1, 123
  1489 k2_k4:
  1490     times 8 db 128,    0        ;placeholder
  1491     times 8 db 123,   -1
  1492     times 8 db 108,   -8
  1493     times 8 db  93,   -6
  1494     times 8 db  77,  -16
  1495     times 8 db  50,   -9
  1496     times 8 db  36,  -11
  1497     times 8 db  12,   -6
  1498 align 16
       ; Bilinear filter table: row i (16 bytes) is the byte pair
       ; (128 - 16*i, 16*i) repeated 8 times, so each pair sums to 128.
       ; The bilinear routines index it with (offset << 4).
       ; NOTE(review): the byte 128 in row 0 does not fit a signed byte; when a
       ; row is used as the source operand of pmaddubsw it is read as signed.
       ; The 8x8 routine bypasses the horizontal or vertical pass when the
       ; corresponding offset is 0, but only checks one offset at a time.
  1499 vp8_bilinear_filters_ssse3:
  1500     times 8 db 128, 0
  1501     times 8 db 112, 16
  1502     times 8 db 96,  32
  1503     times 8 db 80,  48
  1504     times 8 db 64,  64
  1505     times 8 db 48,  80
  1506     times 8 db 32,  96
  1507     times 8 db 16,  112

mercurial