media/libvpx/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 %macro VERTx4 1
       ; VERTx4 <avg>: 8-tap vertical filter, 4 pixels wide, one output row per
       ; loop iteration. When %1 is non-zero the filtered row is averaged (pavgb)
       ; with the 4 bytes already at the destination before being stored.
       ; Inputs are read through arg(0)..arg(5); see the comment on each load.
       ; The filter is loaded with movdqa, so the filter pointer must be
       ; 16-byte aligned.
       ; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
       ; scratch operands; they are written with movdqa, so memory slots have to
       ; be 16-byte aligned.
       ; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, plus r8 when
       ; ABI_IS_32BIT=0.
       ; The loop tests its counter after the body, so the body always runs at
       ; least once.
    15     mov         rdx, arg(5)                 ;filter ptr
    16     mov         rsi, arg(0)                 ;src_ptr
    17     mov         rdi, arg(2)                 ;output_ptr
    18     mov         rcx, 0x0400040              ;64 in each 16-bit word: rounding term for the >>7 below
    20     movdqa      xmm4, [rdx]                 ;load filters
    21     movd        xmm5, rcx
    22     packsswb    xmm4, xmm4                  ;8 word taps -> 8 signed bytes (saturating), in both halves
    23     pshuflw     xmm0, xmm4, 0b              ;k0_k1
    24     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    25     pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    26     pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
       ;copy the low quadword (four copies of a tap pair) into the high quadword
    28     punpcklqdq  xmm0, xmm0
    29     punpcklqdq  xmm1, xmm1
    30     punpcklqdq  xmm2, xmm2
    31     punpcklqdq  xmm3, xmm3
    33     movdqa      k0k1, xmm0
    34     movdqa      k2k3, xmm1
    35     pshufd      xmm5, xmm5, 0               ;broadcast the rounding dword to all lanes
    36     movdqa      k4k5, xmm2
    37     movdqa      k6k7, xmm3
    38     movdqa      krd, xmm5
    40     movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
    42 %if ABI_IS_32BIT=0
    43     movsxd      r8, DWORD PTR arg(3)        ;out_pitch
    44 %endif
    45     mov         rax, rsi
    46     movsxd      rcx, DWORD PTR arg(4)       ;output_height
    47     add         rax, rdx                    ;rax = src_ptr + pitch (addresses the odd rows)
    49     lea         rbx, [rdx + rdx*4]
    50     add         rbx, rdx                    ;pitch * 6
    52 .loop:
       ;A..H are eight consecutive source rows, starting at rsi
    53     movd        xmm0, [rsi]                 ;A
    54     movd        xmm1, [rsi + rdx]           ;B
    55     movd        xmm2, [rsi + rdx * 2]       ;C
    56     movd        xmm3, [rax + rdx * 2]       ;D
    57     movd        xmm4, [rsi + rdx * 4]       ;E
    58     movd        xmm5, [rax + rdx * 4]       ;F
    60     punpcklbw   xmm0, xmm1                  ;A B
    61     punpcklbw   xmm2, xmm3                  ;C D
    62     punpcklbw   xmm4, xmm5                  ;E F
    64     movd        xmm6, [rsi + rbx]           ;G
    65     movd        xmm7, [rax + rbx]           ;H
    67     pmaddubsw   xmm0, k0k1
    68     pmaddubsw   xmm2, k2k3
    69     punpcklbw   xmm6, xmm7                  ;G H
    70     pmaddubsw   xmm4, k4k5
    71     pmaddubsw   xmm6, k6k7
       ;sum order: (k0k1 + k6k7), then min(k2k3, k4k5), then max(k2k3, k4k5);
       ;presumably chosen so the saturating adds do not clip an intermediate sum
    73     movdqa      xmm1, xmm2
    74     paddsw      xmm0, xmm6
    75     pmaxsw      xmm2, xmm4
    76     pminsw      xmm4, xmm1
    77     paddsw      xmm0, xmm4
    78     paddsw      xmm0, xmm2
    80     paddsw      xmm0, krd                   ;add rounding
    81     psraw       xmm0, 7
    82     packuswb    xmm0, xmm0                  ;clamp to unsigned bytes
    84     add         rsi,  rdx                   ;advance both source pointers one row
    85     add         rax,  rdx
    86 %if %1
    87     movd        xmm1, [rdi]
    88     pavgb       xmm0, xmm1
    89 %endif
    90     movd        [rdi], xmm0
    92 %if ABI_IS_32BIT
    93     add         rdi, DWORD PTR arg(3)       ;out_pitch
    94 %else
    95     add         rdi, r8
    96 %endif
    97     dec         rcx
    98     jnz         .loop
    99 %endm
   101 %macro VERTx8 1
       ; VERTx8 <avg>: 8-tap vertical filter, 8 pixels wide, one output row per
       ; loop iteration. When %1 is non-zero the filtered row is averaged (pavgb)
       ; with the 8 bytes already at the destination before being stored.
       ; Inputs are read through arg(0)..arg(5); see the comment on each load.
       ; The filter is loaded with movdqa, so the filter pointer must be
       ; 16-byte aligned.
       ; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
       ; scratch operands; they are written with movdqa, so memory slots have to
       ; be 16-byte aligned.
       ; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, plus r8 when
       ; ABI_IS_32BIT=0.
       ; The loop tests its counter after the body, so the body always runs at
       ; least once.
   102     mov         rdx, arg(5)                 ;filter ptr
   103     mov         rsi, arg(0)                 ;src_ptr
   104     mov         rdi, arg(2)                 ;output_ptr
   105     mov         rcx, 0x0400040              ;64 in each 16-bit word: rounding term for the >>7 below
   107     movdqa      xmm4, [rdx]                 ;load filters
   108     movq        xmm5, rcx
   109     packsswb    xmm4, xmm4                  ;8 word taps -> 8 signed bytes (saturating), in both halves
   110     pshuflw     xmm0, xmm4, 0b              ;k0_k1
   111     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
   112     pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
   113     pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
       ;copy the low quadword (four copies of a tap pair) into the high quadword
   115     punpcklqdq  xmm0, xmm0
   116     punpcklqdq  xmm1, xmm1
   117     punpcklqdq  xmm2, xmm2
   118     punpcklqdq  xmm3, xmm3
   120     movdqa      k0k1, xmm0
   121     movdqa      k2k3, xmm1
   122     pshufd      xmm5, xmm5, 0               ;broadcast the rounding dword to all lanes
   123     movdqa      k4k5, xmm2
   124     movdqa      k6k7, xmm3
   125     movdqa      krd, xmm5
   127     movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
   129 %if ABI_IS_32BIT=0
   130     movsxd      r8, DWORD PTR arg(3)        ;out_pitch
   131 %endif
   132     mov         rax, rsi
   133     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   134     add         rax, rdx                    ;rax = src_ptr + pitch (addresses the odd rows)
   136     lea         rbx, [rdx + rdx*4]
   137     add         rbx, rdx                    ;pitch * 6
   139 .loop:
       ;A..H are eight consecutive source rows, starting at rsi
   140     movq        xmm0, [rsi]                 ;A
   141     movq        xmm1, [rsi + rdx]           ;B
   142     movq        xmm2, [rsi + rdx * 2]       ;C
   143     movq        xmm3, [rax + rdx * 2]       ;D
   144     movq        xmm4, [rsi + rdx * 4]       ;E
   145     movq        xmm5, [rax + rdx * 4]       ;F
   147     punpcklbw   xmm0, xmm1                  ;A B
   148     punpcklbw   xmm2, xmm3                  ;C D
   149     punpcklbw   xmm4, xmm5                  ;E F
   151     movq        xmm6, [rsi + rbx]           ;G
   152     movq        xmm7, [rax + rbx]           ;H
   154     pmaddubsw   xmm0, k0k1
   155     pmaddubsw   xmm2, k2k3
   156     punpcklbw   xmm6, xmm7                  ;G H
   157     pmaddubsw   xmm4, k4k5
   158     pmaddubsw   xmm6, k6k7
       ;sum order: (k0k1 + k6k7), then min(k2k3, k4k5), then max(k2k3, k4k5);
       ;presumably chosen so the saturating adds do not clip an intermediate sum
   160     paddsw      xmm0, xmm6
   161     movdqa      xmm1, xmm2
   162     pmaxsw      xmm2, xmm4
   163     pminsw      xmm4, xmm1
   164     paddsw      xmm0, xmm4
   165     paddsw      xmm0, xmm2
   167     paddsw      xmm0, krd                   ;add rounding
   168     psraw       xmm0, 7
   169     packuswb    xmm0, xmm0                  ;clamp to unsigned bytes
   171     add         rsi,  rdx                   ;advance both source pointers one row
   172     add         rax,  rdx
   173 %if %1
   174     movq        xmm1, [rdi]
   175     pavgb       xmm0, xmm1
   176 %endif
   177     movq        [rdi], xmm0
   179 %if ABI_IS_32BIT
   180     add         rdi, DWORD PTR arg(3)       ;out_pitch
   181 %else
   182     add         rdi, r8
   183 %endif
   184     dec         rcx
   185     jnz         .loop
   186 %endm
   189 %macro VERTx16 1
       ; VERTx16 <avg>: 8-tap vertical filter, 16 pixels wide, one output row per
       ; loop iteration, computed as two independent 8-pixel halves (columns 0-7,
       ; then columns 8-15). When %1 is non-zero each half is averaged (pavgb)
       ; with the bytes already at the destination before being stored.
       ; Inputs are read through arg(0)..arg(5); see the comment on each load.
       ; The filter is loaded with movdqa, so the filter pointer must be
       ; 16-byte aligned.
       ; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
       ; scratch operands; they are written with movdqa, so memory slots have to
       ; be 16-byte aligned.
       ; Registers written: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7, plus r8 when
       ; ABI_IS_32BIT=0.
       ; The loop tests its counter after the body, so the body always runs at
       ; least once.
   190     mov         rdx, arg(5)                 ;filter ptr
   191     mov         rsi, arg(0)                 ;src_ptr
   192     mov         rdi, arg(2)                 ;output_ptr
   193     mov         rcx, 0x0400040              ;64 in each 16-bit word: rounding term for the >>7 below
   195     movdqa      xmm4, [rdx]                 ;load filters
   196     movq        xmm5, rcx
   197     packsswb    xmm4, xmm4                  ;8 word taps -> 8 signed bytes (saturating), in both halves
   198     pshuflw     xmm0, xmm4, 0b              ;k0_k1
   199     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
   200     pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
   201     pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
       ;copy the low quadword (four copies of a tap pair) into the high quadword
   203     punpcklqdq  xmm0, xmm0
   204     punpcklqdq  xmm1, xmm1
   205     punpcklqdq  xmm2, xmm2
   206     punpcklqdq  xmm3, xmm3
   208     movdqa      k0k1, xmm0
   209     movdqa      k2k3, xmm1
   210     pshufd      xmm5, xmm5, 0               ;broadcast the rounding dword to all lanes
   211     movdqa      k4k5, xmm2
   212     movdqa      k6k7, xmm3
   213     movdqa      krd, xmm5
   215     movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
   217 %if ABI_IS_32BIT=0
   218     movsxd      r8, DWORD PTR arg(3)        ;out_pitch
   219 %endif
   220     mov         rax, rsi
   221     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   222     add         rax, rdx                    ;rax = src_ptr + pitch (addresses the odd rows)
   224     lea         rbx, [rdx + rdx*4]
   225     add         rbx, rdx                    ;pitch * 6
   227 .loop:
       ;---- columns 0-7: A..H are eight consecutive source rows ----
   228     movq        xmm0, [rsi]                 ;A
   229     movq        xmm1, [rsi + rdx]           ;B
   230     movq        xmm2, [rsi + rdx * 2]       ;C
   231     movq        xmm3, [rax + rdx * 2]       ;D
   232     movq        xmm4, [rsi + rdx * 4]       ;E
   233     movq        xmm5, [rax + rdx * 4]       ;F
   235     punpcklbw   xmm0, xmm1                  ;A B
   236     punpcklbw   xmm2, xmm3                  ;C D
   237     punpcklbw   xmm4, xmm5                  ;E F
   239     movq        xmm6, [rsi + rbx]           ;G
   240     movq        xmm7, [rax + rbx]           ;H
   242     pmaddubsw   xmm0, k0k1
   243     pmaddubsw   xmm2, k2k3
   244     punpcklbw   xmm6, xmm7                  ;G H
   245     pmaddubsw   xmm4, k4k5
   246     pmaddubsw   xmm6, k6k7
       ;sum order: (k0k1 + k6k7), then min(k2k3, k4k5), then max(k2k3, k4k5);
       ;presumably chosen so the saturating adds do not clip an intermediate sum
   248     paddsw      xmm0, xmm6
   249     movdqa      xmm1, xmm2
   250     pmaxsw      xmm2, xmm4
   251     pminsw      xmm4, xmm1
   252     paddsw      xmm0, xmm4
   253     paddsw      xmm0, xmm2
   255     paddsw      xmm0, krd                   ;add rounding
   256     psraw       xmm0, 7
   257     packuswb    xmm0, xmm0                  ;clamp to unsigned bytes
   258 %if %1
   259     movq        xmm1, [rdi]
   260     pavgb       xmm0, xmm1
   261 %endif
   262     movq        [rdi], xmm0
       ;---- columns 8-15: same rows, 8 bytes further right ----
   264     movq        xmm0, [rsi + 8]             ;A
   265     movq        xmm1, [rsi + rdx + 8]       ;B
   266     movq        xmm2, [rsi + rdx * 2 + 8]   ;C
   267     movq        xmm3, [rax + rdx * 2 + 8]   ;D
   268     movq        xmm4, [rsi + rdx * 4 + 8]   ;E
   269     movq        xmm5, [rax + rdx * 4 + 8]   ;F
   271     punpcklbw   xmm0, xmm1                  ;A B
   272     punpcklbw   xmm2, xmm3                  ;C D
   273     punpcklbw   xmm4, xmm5                  ;E F
   276     movq        xmm6, [rsi + rbx + 8]       ;G
   277     movq        xmm7, [rax + rbx + 8]       ;H
   278     punpcklbw   xmm6, xmm7                  ;G H
   281     pmaddubsw   xmm0, k0k1
   282     pmaddubsw   xmm2, k2k3
   283     pmaddubsw   xmm4, k4k5
   284     pmaddubsw   xmm6, k6k7
       ;Use exactly the same summation order as columns 0-7. The adds saturate,
       ;so a different order can clip an intermediate sum and make the two
       ;halves of one row disagree. xmm1 is free here (B was consumed by the
       ;punpcklbw above), so it serves as the temporary.
   286     paddsw      xmm0, xmm6
           movdqa      xmm1, xmm2
           pmaxsw      xmm2, xmm4
           pminsw      xmm4, xmm1
           paddsw      xmm0, xmm4
           paddsw      xmm0, xmm2
   289     paddsw      xmm0, krd                   ;add rounding
   291     psraw       xmm0, 7
   292     packuswb    xmm0, xmm0                  ;clamp to unsigned bytes
   294     add         rsi,  rdx                   ;advance both source pointers one row
   295     add         rax,  rdx
   296 %if %1
   297     movq    xmm1, [rdi+8]
   298     pavgb   xmm0, xmm1
   299 %endif
   301     movq        [rdi+8], xmm0
   303 %if ABI_IS_32BIT
   304     add         rdi, DWORD PTR arg(3)       ;out_pitch
   305 %else
   306     add         rdi, r8
   307 %endif
   308     dec         rcx
   309     jnz         .loop
   310 %endm
   312 ;void vp9_filter_block1d4_v8_ssse3
   313 ;(
   314 ;    unsigned char *src_ptr,
   315 ;    unsigned int   src_pitch,
   316 ;    unsigned char *output_ptr,
   317 ;    unsigned int   out_pitch,
   318 ;    unsigned int   output_height,
   319 ;    short *filter
   320 ;)
       ; 8-tap vertical filter, 4 pixels wide. The body is "VERTx4 0": the
       ; filtered row is stored directly, without averaging with the destination.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
       ; UNSHADOW_ARGS come from x86_abi_support.asm (not shown in this file).
   321 global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
   322 sym(vp9_filter_block1d4_v8_ssse3):
   323     push        rbp
   324     mov         rbp, rsp
   325     SHADOW_ARGS_TO_STACK 6
   326     SAVE_XMM 7
   327     push        rsi
   328     push        rdi
   329     push        rbx
   330     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the VERTx4 operands
   332     ALIGN_STACK 16, rax
   333     sub         rsp, 16*5
   334     %define k0k1 [rsp + 16*0]
   335     %define k2k3 [rsp + 16*1]
   336     %define k4k5 [rsp + 16*2]
   337     %define k6k7 [rsp + 16*3]
   338     %define krd [rsp + 16*4]
   340     VERTx4 0
   342     add rsp, 16*5                           ;release the scratch slots
   343     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   344     pop rbx
   345     ; begin epilog
   346     pop rdi
   347     pop rsi
   348     RESTORE_XMM
   349     UNSHADOW_ARGS
   350     pop         rbp
   351     ret
   353 ;void vp9_filter_block1d8_v8_ssse3
   354 ;(
   355 ;    unsigned char *src_ptr,
   356 ;    unsigned int   src_pitch,
   357 ;    unsigned char *output_ptr,
   358 ;    unsigned int   out_pitch,
   359 ;    unsigned int   output_height,
   360 ;    short *filter
   361 ;)
       ; 8-tap vertical filter, 8 pixels wide. The body is "VERTx8 0": the
       ; filtered row is stored directly, without averaging with the destination.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
       ; UNSHADOW_ARGS come from x86_abi_support.asm (not shown in this file).
   362 global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
   363 sym(vp9_filter_block1d8_v8_ssse3):
   364     push        rbp
   365     mov         rbp, rsp
   366     SHADOW_ARGS_TO_STACK 6
   367     SAVE_XMM 7
   368     push        rsi
   369     push        rdi
   370     push        rbx
   371     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the VERTx8 operands
   373     ALIGN_STACK 16, rax
   374     sub         rsp, 16*5
   375     %define k0k1 [rsp + 16*0]
   376     %define k2k3 [rsp + 16*1]
   377     %define k4k5 [rsp + 16*2]
   378     %define k6k7 [rsp + 16*3]
   379     %define krd [rsp + 16*4]
   381     VERTx8 0
   383     add rsp, 16*5                           ;release the scratch slots
   384     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   385     pop rbx
   386     ; begin epilog
   387     pop rdi
   388     pop rsi
   389     RESTORE_XMM
   390     UNSHADOW_ARGS
   391     pop         rbp
   392     ret
   394 ;void vp9_filter_block1d16_v8_ssse3
   395 ;(
   396 ;    unsigned char *src_ptr,
   397 ;    unsigned int   src_pitch,
   398 ;    unsigned char *output_ptr,
   399 ;    unsigned int   out_pitch,
   400 ;    unsigned int   output_height,
   401 ;    short *filter
   402 ;)
       ; 8-tap vertical filter, 16 pixels wide. The body is "VERTx16 0": the
       ; filtered row is stored directly, without averaging with the destination.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
       ; UNSHADOW_ARGS come from x86_abi_support.asm (not shown in this file).
   403 global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
   404 sym(vp9_filter_block1d16_v8_ssse3):
   405     push        rbp
   406     mov         rbp, rsp
   407     SHADOW_ARGS_TO_STACK 6
   408     SAVE_XMM 7
   409     push        rsi
   410     push        rdi
   411     push        rbx
   412     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the VERTx16 operands
   414     ALIGN_STACK 16, rax
   415     sub         rsp, 16*5
   416     %define k0k1 [rsp + 16*0]
   417     %define k2k3 [rsp + 16*1]
   418     %define k4k5 [rsp + 16*2]
   419     %define k6k7 [rsp + 16*3]
   420     %define krd [rsp + 16*4]
   422     VERTx16 0
   424     add rsp, 16*5                           ;release the scratch slots
   425     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   426     pop rbx
   427     ; begin epilog
   428     pop rdi
   429     pop rsi
   430     RESTORE_XMM
   431     UNSHADOW_ARGS
   432     pop         rbp
   433     ret
   435 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   438 global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
       ; 8-tap vertical filter, 4 pixels wide, averaging variant. The body is
       ; "VERTx4 1": each filtered row is averaged (pavgb) with the bytes already
       ; at the destination before being stored. The six arguments are read by
       ; VERTx4 through arg(0)..arg(5): src_ptr, pixels_per_line, output_ptr,
       ; out_pitch, output_height, filter ptr.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
       ; UNSHADOW_ARGS come from x86_abi_support.asm (not shown in this file).
   439 sym(vp9_filter_block1d4_v8_avg_ssse3):
   440     push        rbp
   441     mov         rbp, rsp
   442     SHADOW_ARGS_TO_STACK 6
   443     SAVE_XMM 7
   444     push        rsi
   445     push        rdi
   446     push        rbx
   447     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the VERTx4 operands
   449     ALIGN_STACK 16, rax
   450     sub         rsp, 16*5
   451     %define k0k1 [rsp + 16*0]
   452     %define k2k3 [rsp + 16*1]
   453     %define k4k5 [rsp + 16*2]
   454     %define k6k7 [rsp + 16*3]
   455     %define krd [rsp + 16*4]
   457     VERTx4 1
   459     add rsp, 16*5                           ;release the scratch slots
   460     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   461     pop rbx
   462     ; begin epilog
   463     pop rdi
   464     pop rsi
   465     RESTORE_XMM
   466     UNSHADOW_ARGS
   467     pop         rbp
   468     ret
   470 global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
       ; 8-tap vertical filter, 8 pixels wide, averaging variant. The body is
       ; "VERTx8 1": each filtered row is averaged (pavgb) with the bytes already
       ; at the destination before being stored. The six arguments are read by
       ; VERTx8 through arg(0)..arg(5): src_ptr, pixels_per_line, output_ptr,
       ; out_pitch, output_height, filter ptr.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
       ; UNSHADOW_ARGS come from x86_abi_support.asm (not shown in this file).
   471 sym(vp9_filter_block1d8_v8_avg_ssse3):
   472     push        rbp
   473     mov         rbp, rsp
   474     SHADOW_ARGS_TO_STACK 6
   475     SAVE_XMM 7
   476     push        rsi
   477     push        rdi
   478     push        rbx
   479     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the VERTx8 operands
   481     ALIGN_STACK 16, rax
   482     sub         rsp, 16*5
   483     %define k0k1 [rsp + 16*0]
   484     %define k2k3 [rsp + 16*1]
   485     %define k4k5 [rsp + 16*2]
   486     %define k6k7 [rsp + 16*3]
   487     %define krd [rsp + 16*4]
   489     VERTx8 1
   491     add rsp, 16*5                           ;release the scratch slots
   492     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   493     pop rbx
   494     ; begin epilog
   495     pop rdi
   496     pop rsi
   497     RESTORE_XMM
   498     UNSHADOW_ARGS
   499     pop         rbp
   500     ret
   502 global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
       ; 8-tap vertical filter, 16 pixels wide, averaging variant. The body is
       ; "VERTx16 1": each filtered half-row is averaged (pavgb) with the bytes
       ; already at the destination before being stored. The six arguments are
       ; read by VERTx16 through arg(0)..arg(5): src_ptr, pixels_per_line,
       ; output_ptr, out_pitch, output_height, filter ptr.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK, RESTORE_XMM and
       ; UNSHADOW_ARGS come from x86_abi_support.asm (not shown in this file).
   503 sym(vp9_filter_block1d16_v8_avg_ssse3):
   504     push        rbp
   505     mov         rbp, rsp
   506     SHADOW_ARGS_TO_STACK 6
   507     SAVE_XMM 7
   508     push        rsi
   509     push        rdi
   510     push        rbx
   511     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the VERTx16 operands
   513     ALIGN_STACK 16, rax
   514     sub         rsp, 16*5
   515     %define k0k1 [rsp + 16*0]
   516     %define k2k3 [rsp + 16*1]
   517     %define k4k5 [rsp + 16*2]
   518     %define k6k7 [rsp + 16*3]
   519     %define krd [rsp + 16*4]
   521     VERTx16 1
   523     add rsp, 16*5                           ;release the scratch slots
   524     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   525     pop rbx
   526     ; begin epilog
   527     pop rdi
   528     pop rsi
   529     RESTORE_XMM
   530     UNSHADOW_ARGS
   531     pop         rbp
   532     ret
   534 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   535 %macro HORIZx4_ROW 2
       ; HORIZx4_ROW src, tmp: filter one row horizontally, 4 output pixels.
       ;   %1  in:  16 consecutive source bytes
       ;       out: filtered pixels as unsigned bytes (packuswb %1, %1)
       ;   %2  scratch
       ; Also overwrites xmm4, xmm5 and xmm6. Needs k0k1k4k5, k2k3k6k7 and krd
       ; to be defined by the caller, and the shuf_t0t1 / shuf_t2t3 tables.
   536     movdqa      %2,   %1
   537     pshufb      %1,   [GLOBAL(shuf_t0t1)]
   538     pshufb      %2,   [GLOBAL(shuf_t2t3)]
       ;low quadword gets the first tap pair, high quadword the second tap pair
   539     pmaddubsw   %1,   k0k1k4k5
   540     pmaddubsw   %2,   k2k3k6k7
   542     movdqa      xmm4, %1                    ;k0k1 terms (low quadword)
   543     movdqa      xmm5, %2                    ;k2k3 terms (low quadword)
   544     psrldq      %1,   8                     ;bring the k4k5 terms down to the low quadword
   545     psrldq      %2,   8                     ;bring the k6k7 terms down to the low quadword
   546     movdqa      xmm6, xmm5
       ;sum order: (k0k1 + k6k7), then min(k2k3, k4k5), then max(k2k3, k4k5);
       ;presumably chosen so the saturating adds do not clip an intermediate sum
   548     paddsw      xmm4, %2
   549     pmaxsw      xmm5, %1
   550     pminsw      %1, xmm6
   551     paddsw      %1, xmm4
   552     paddsw      %1, xmm5
   554     paddsw      %1,   krd                   ;add rounding
   555     psraw       %1,   7
   556     packuswb    %1,   %1                    ;clamp to unsigned bytes
   557 %endm
   559 %macro HORIZx4 1
       ; HORIZx4 <avg>: 8-tap horizontal filter, 4 pixels wide. Rows are handled
       ; two at a time; a trailing single row is handled when output_height is
       ; odd. When %1 is non-zero each filtered row is averaged (pavgb) with the
       ; bytes already at the destination before being stored.
       ; Inputs are read through arg(0)..arg(5); see the comment on each load.
       ; The filter is loaded with movdqa, so the filter pointer must be
       ; 16-byte aligned. Source bytes are read from [row - 3] to [row + 12].
       ; The invoking function must %define k0k1k4k5, k2k3k6k7 and krd as
       ; scratch operands written with movdqa (memory slots must be 16-byte
       ; aligned).
       ; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
       ; An output_height of 0 or 1 skips the two-row loop entirely.
   560     mov         rdx, arg(5)                 ;filter ptr
   561     mov         rsi, arg(0)                 ;src_ptr
   562     mov         rdi, arg(2)                 ;output_ptr
   563     mov         rcx, 0x0400040              ;64 in each 16-bit word: rounding term for the >>7
   565     movdqa      xmm4, [rdx]                 ;load filters
   566     movq        xmm5, rcx
   567     packsswb    xmm4, xmm4                  ;8 word taps -> 8 signed bytes (saturating), in both halves
   568     pshuflw     xmm6, xmm4, 0b              ;k0_k1
   569     pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
   570     pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
   571     pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
   572     pshufd      xmm5, xmm5, 0               ;rounding
   574     movdqa      k0k1k4k5, xmm6
   575     movdqa      k2k3k6k7, xmm7
   576     movdqa      krd, xmm5
   578     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
   579     movsxd      rdx, dword ptr arg(3)       ;output_pitch
   580     movsxd      rcx, dword ptr arg(4)       ;output_height
   581     shr         rcx, 1                      ;rcx = number of row pairs
           jz          .last_row                   ;no full pair: entering the loop with rcx = 0 would wrap the counter
   582 .loop:
   583     ;Do two rows once
   584     movq        xmm0,   [rsi - 3]           ;load src
   585     movq        xmm1,   [rsi + 5]
   586     movq        xmm2,   [rsi + rax - 3]
   587     movq        xmm3,   [rsi + rax + 5]
   588     punpcklqdq  xmm0,   xmm1
   589     punpcklqdq  xmm2,   xmm3
   591     HORIZx4_ROW xmm0,   xmm1
   592     HORIZx4_ROW xmm2,   xmm3
   593 %if %1
   594     movd        xmm1,   [rdi]
   595     pavgb       xmm0,   xmm1
   596     movd        xmm3,   [rdi + rdx]
   597     pavgb       xmm2,   xmm3
   598 %endif
   599     movd        [rdi],  xmm0
   600     movd        [rdi +rdx],  xmm2
   602     lea         rsi,    [rsi + rax]
   603     prefetcht0  [rsi + 4 * rax - 3]
   604     lea         rsi,    [rsi + rax]
   605     lea         rdi,    [rdi + 2 * rdx]
   606     prefetcht0  [rsi + 2 * rax - 3]
   608     dec         rcx
   609     jnz         .loop
       .last_row:
   611     ; Do last row if output_height is odd
   612     movsxd      rcx,    dword ptr arg(4)       ;output_height
   613     and         rcx,    1
   614     je          .done
   616     movq        xmm0,   [rsi - 3]    ; load src
   617     movq        xmm1,   [rsi + 5]
   618     punpcklqdq  xmm0,   xmm1
   620     HORIZx4_ROW xmm0, xmm1
   621 %if %1
   622     movd        xmm1,   [rdi]
   623     pavgb       xmm0,   xmm1
   624 %endif
   625     movd        [rdi],  xmm0
   626 .done:
   627 %endm
   629 %macro HORIZx8_ROW 4
       ; HORIZx8_ROW src, t1, t2, t3: filter one row horizontally, 8 output pixels.
       ;   %1        in:  16 consecutive source bytes
       ;             out: 8 filtered pixels as unsigned bytes (packuswb %1, %1)
       ;   %2 %3 %4  scratch
       ; Needs k0k1, k2k3, k4k5, k6k7 and krd to be defined by the caller, and
       ; the four shuf_t* tables.
   630     movdqa      %2,   %1
   631     movdqa      %3,   %1
   632     movdqa      %4,   %1
       ;arrange the bytes each tap pair multiplies, one byte pair per output pixel
   634     pshufb      %1,   [GLOBAL(shuf_t0t1)]
   635     pshufb      %2,   [GLOBAL(shuf_t2t3)]
   636     pshufb      %3,   [GLOBAL(shuf_t4t5)]
   637     pshufb      %4,   [GLOBAL(shuf_t6t7)]
   639     pmaddubsw   %1,   k0k1
   640     pmaddubsw   %2,   k2k3
   641     pmaddubsw   %3,   k4k5
   642     pmaddubsw   %4,   k6k7
       ;sum order: (k0k1 + k6k7), then min(k2k3, k4k5), then max(k2k3, k4k5);
       ;presumably chosen so the saturating adds do not clip an intermediate sum
   644     paddsw      %1,   %4
   645     movdqa      %4,   %2
   646     pmaxsw      %2,   %3
   647     pminsw      %3,   %4
   648     paddsw      %1,   %3
   649     paddsw      %1,   %2
   651     paddsw      %1,   krd                   ;add rounding
   652     psraw       %1,   7
   653     packuswb    %1,   %1                    ;clamp to unsigned bytes
   654 %endm
   656 %macro HORIZx8 1
       ; HORIZx8 <avg>: 8-tap horizontal filter, 8 pixels wide. Rows are handled
       ; two at a time; a trailing single row is handled when output_height is
       ; odd. When %1 is non-zero each filtered row is averaged (pavgb) with the
       ; bytes already at the destination before being stored.
       ; Inputs are read through arg(0)..arg(5); see the comment on each load.
       ; The filter is loaded with movdqa, so the filter pointer must be
       ; 16-byte aligned. Source bytes are read from [row - 3] to [row + 12].
       ; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
       ; scratch operands written with movdqa (memory slots must be 16-byte
       ; aligned).
       ; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
       ; An output_height of 0 or 1 skips the two-row loop entirely.
   657     mov         rdx, arg(5)                 ;filter ptr
   658     mov         rsi, arg(0)                 ;src_ptr
   659     mov         rdi, arg(2)                 ;output_ptr
   660     mov         rcx, 0x0400040              ;64 in each 16-bit word: rounding term for the >>7
   662     movdqa      xmm4, [rdx]                 ;load filters
   663     movd        xmm5, rcx
   664     packsswb    xmm4, xmm4                  ;8 word taps -> 8 signed bytes (saturating), in both halves
   665     pshuflw     xmm0, xmm4, 0b              ;k0_k1
   666     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
   667     pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
   668     pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
       ;copy the low quadword (four copies of a tap pair) into the high quadword
   670     punpcklqdq  xmm0, xmm0
   671     punpcklqdq  xmm1, xmm1
   672     punpcklqdq  xmm2, xmm2
   673     punpcklqdq  xmm3, xmm3
   675     movdqa      k0k1, xmm0
   676     movdqa      k2k3, xmm1
   677     pshufd      xmm5, xmm5, 0               ;broadcast the rounding dword to all lanes
   678     movdqa      k4k5, xmm2
   679     movdqa      k6k7, xmm3
   680     movdqa      krd, xmm5
   682     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
   683     movsxd      rdx, dword ptr arg(3)       ;output_pitch
   684     movsxd      rcx, dword ptr arg(4)       ;output_height
   685     shr         rcx, 1                      ;rcx = number of row pairs
           jz          .last_row                   ;no full pair: entering the loop with rcx = 0 would wrap the counter
   687 .loop:
   688     movq        xmm0,   [rsi - 3]           ;load src
   689     movq        xmm3,   [rsi + 5]
   690     movq        xmm4,   [rsi + rax - 3]
   691     movq        xmm7,   [rsi + rax + 5]
   692     punpcklqdq  xmm0,   xmm3
   693     punpcklqdq  xmm4,   xmm7
   695     HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
   696     HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
   697 %if %1
   698     movq        xmm1,   [rdi]
   699     movq        xmm2,   [rdi + rdx]
   700     pavgb       xmm0,   xmm1
   701     pavgb       xmm4,   xmm2
   702 %endif
   703     movq        [rdi],  xmm0
   704     movq        [rdi + rdx],  xmm4
   706     lea         rsi,    [rsi + rax]
   707     prefetcht0  [rsi + 4 * rax - 3]
   708     lea         rsi,    [rsi + rax]
   709     lea         rdi,    [rdi + 2 * rdx]
   710     prefetcht0  [rsi + 2 * rax - 3]
   711     dec         rcx
   712     jnz         .loop
       .last_row:
   714     ;Do last row if output_height is odd
   715     movsxd      rcx,    dword ptr arg(4)    ;output_height
   716     and         rcx,    1
   717     je          .done
   719     movq        xmm0,   [rsi - 3]
   720     movq        xmm3,   [rsi + 5]
   721     punpcklqdq  xmm0,   xmm3
   723     HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
   724 %if %1
   725     movq        xmm1,   [rdi]
   726     pavgb       xmm0,   xmm1
   727 %endif
   728     movq        [rdi],  xmm0
   729 .done:
   730 %endm
   732 %macro HORIZx16 1
       ; HORIZx16 <avg>: 8-tap horizontal filter, 16 pixels wide, one output row
       ; per loop iteration, computed as two 8-pixel halves that are merged
       ; before the store. When %1 is non-zero the filtered row is averaged
       ; (pavgb) with the 16 bytes already at the destination.
       ; Inputs are read through arg(0)..arg(5); see the comment on each load.
       ; The filter is loaded with movdqa, so the filter pointer must be
       ; 16-byte aligned. The destination row is read and written with movdqa,
       ; so output_ptr must be 16-byte aligned on every row. Source bytes are
       ; read from [row - 3] to [row + 20].
       ; The invoking function must %define k0k1, k2k3, k4k5, k6k7 and krd as
       ; scratch operands written with movdqa (memory slots must be 16-byte
       ; aligned).
       ; Registers written: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
       ; The loop tests its counter after the body, so the body always runs at
       ; least once.
   733     mov         rdx, arg(5)                 ;filter ptr
   734     mov         rsi, arg(0)                 ;src_ptr
   735     mov         rdi, arg(2)                 ;output_ptr
   736     mov         rcx, 0x0400040              ;64 in each 16-bit word: rounding term for the >>7 below
   738     movdqa      xmm4, [rdx]                 ;load filters
   739     movq        xmm5, rcx
   740     packsswb    xmm4, xmm4                  ;8 word taps -> 8 signed bytes (saturating), in both halves
   741     pshuflw     xmm0, xmm4, 0b              ;k0_k1
   742     pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
   743     pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
   744     pshuflw     xmm3, xmm4, 11111111b       ;k6_k7
       ;copy the low quadword (four copies of a tap pair) into the high quadword
   746     punpcklqdq  xmm0, xmm0
   747     punpcklqdq  xmm1, xmm1
   748     punpcklqdq  xmm2, xmm2
   749     punpcklqdq  xmm3, xmm3
   751     movdqa      k0k1, xmm0
   752     movdqa      k2k3, xmm1
   753     pshufd      xmm5, xmm5, 0               ;broadcast the rounding dword to all lanes
   754     movdqa      k4k5, xmm2
   755     movdqa      k6k7, xmm3
   756     movdqa      krd, xmm5
   758     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
   759     movsxd      rdx, dword ptr arg(3)       ;output_pitch
   760     movsxd      rcx, dword ptr arg(4)       ;output_height
   762 .loop:
   763     prefetcht0  [rsi + 2 * rax -3]
   765     movq        xmm0,   [rsi - 3]           ;load src data
   766     movq        xmm4,   [rsi + 5]
   767     movq        xmm7,   [rsi + 13]
   768     punpcklqdq  xmm0,   xmm4                ;xmm0 = source bytes -3..12 (input for pixels 0-7)
   769     punpcklqdq  xmm4,   xmm7                ;xmm4 = source bytes 5..20 (input for pixels 8-15)
   771     movdqa      xmm1,   xmm0
   772     movdqa      xmm2,   xmm0
   773     movdqa      xmm3,   xmm0
   774     movdqa      xmm5,   xmm4
   775     movdqa      xmm6,   xmm4
   776     movdqa      xmm7,   xmm4
       ;arrange the bytes each tap pair multiplies, one byte pair per output pixel
   778     pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
   779     pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
   780     pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
   781     pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
   782     pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
   783     pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
   784     pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
   785     pshufb      xmm7,   [GLOBAL(shuf_t6t7)]
   787     pmaddubsw   xmm0,   k0k1
   788     pmaddubsw   xmm1,   k2k3
   789     pmaddubsw   xmm2,   k4k5
   790     pmaddubsw   xmm3,   k6k7
   791     pmaddubsw   xmm4,   k0k1
   792     pmaddubsw   xmm5,   k2k3
   793     pmaddubsw   xmm6,   k4k5
   794     pmaddubsw   xmm7,   k6k7
       ;pixels 0-7: (k0k1 + k6k7), then min(k2k3, k4k5), then max(k2k3, k4k5)
   796     paddsw      xmm0,   xmm3
   797     movdqa      xmm3,   xmm1
   798     pmaxsw      xmm1,   xmm2
   799     pminsw      xmm2,   xmm3
   800     paddsw      xmm0,   xmm2
   801     paddsw      xmm0,   xmm1
       ;pixels 8-15: same summation order
   803     paddsw      xmm4,   xmm7
   804     movdqa      xmm7,   xmm5
   805     pmaxsw      xmm5,   xmm6
   806     pminsw      xmm6,   xmm7
   807     paddsw      xmm4,   xmm6
   808     paddsw      xmm4,   xmm5
   810     paddsw      xmm0,   krd                 ;add rounding
   811     paddsw      xmm4,   krd
   812     psraw       xmm0,   7
   813     psraw       xmm4,   7
   814     packuswb    xmm0,   xmm0                ;clamp to unsigned bytes
   815     packuswb    xmm4,   xmm4
   816     punpcklqdq  xmm0,   xmm4                ;merge the two halves into one 16-byte row
   817 %if %1
   818     movdqa      xmm1,   [rdi]
   819     pavgb       xmm0,   xmm1
   820 %endif
   822     lea         rsi,    [rsi + rax]
   823     movdqa      [rdi],  xmm0
   825     lea         rdi,    [rdi + rdx]
   826     dec         rcx
   827     jnz         .loop
   828 %endm
   830 ;void vp9_filter_block1d4_h8_ssse3
   831 ;(
   832 ;    unsigned char  *src_ptr,
   833 ;    unsigned int    src_pixels_per_line,
   834 ;    unsigned char  *output_ptr,
   835 ;    unsigned int    output_pitch,
   836 ;    unsigned int    output_height,
   837 ;    short *filter
   838 ;)
       ; 8-tap horizontal filter, 4 pixels wide. The body is "HORIZx4 0": the
       ; filtered rows are stored directly, without averaging with the
       ; destination.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, RESTORE_GOT,
       ; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm (not shown
       ; in this file).
   839 global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
   840 sym(vp9_filter_block1d4_h8_ssse3):
   841     push        rbp
   842     mov         rbp, rsp
   843     SHADOW_ARGS_TO_STACK 6
   844     SAVE_XMM 7
   845     GET_GOT     rbx
   846     push        rsi
   847     push        rdi
   848     ; end prolog
       ;reserve three 16-byte scratch slots on the stack for the HORIZx4 operands
   850     ALIGN_STACK 16, rax
   851     sub         rsp, 16 * 3
   852     %define k0k1k4k5 [rsp + 16 * 0]
   853     %define k2k3k6k7 [rsp + 16 * 1]
   854     %define krd      [rsp + 16 * 2]
   856     HORIZx4 0
   858     add rsp, 16 * 3                         ;release the scratch slots
   859     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   860     ; begin epilog
   861     pop rdi
   862     pop rsi
   863     RESTORE_GOT
   864     RESTORE_XMM
   865     UNSHADOW_ARGS
   866     pop         rbp
   867     ret
   869 ;void vp9_filter_block1d8_h8_ssse3
   870 ;(
   871 ;    unsigned char  *src_ptr,
   872 ;    unsigned int    src_pixels_per_line,
   873 ;    unsigned char  *output_ptr,
   874 ;    unsigned int    output_pitch,
   875 ;    unsigned int    output_height,
   876 ;    short *filter
   877 ;)
       ; 8-tap horizontal filter, 8 pixels wide. The body is "HORIZx8 0": the
       ; filtered rows are stored directly, without averaging with the
       ; destination.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, RESTORE_GOT,
       ; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm (not shown
       ; in this file).
   878 global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
   879 sym(vp9_filter_block1d8_h8_ssse3):
   880     push        rbp
   881     mov         rbp, rsp
   882     SHADOW_ARGS_TO_STACK 6
   883     SAVE_XMM 7
   884     GET_GOT     rbx
   885     push        rsi
   886     push        rdi
   887     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the HORIZx8 operands
   889     ALIGN_STACK 16, rax
   890     sub         rsp, 16*5
   891     %define k0k1 [rsp + 16*0]
   892     %define k2k3 [rsp + 16*1]
   893     %define k4k5 [rsp + 16*2]
   894     %define k6k7 [rsp + 16*3]
   895     %define krd [rsp + 16*4]
   897     HORIZx8 0
   899     add rsp, 16*5                           ;release the scratch slots
   900     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   902     ; begin epilog
   903     pop rdi
   904     pop rsi
   905     RESTORE_GOT
   906     RESTORE_XMM
   907     UNSHADOW_ARGS
   908     pop         rbp
   909     ret
   911 ;void vp9_filter_block1d16_h8_ssse3
   912 ;(
   913 ;    unsigned char  *src_ptr,
   914 ;    unsigned int    src_pixels_per_line,
   915 ;    unsigned char  *output_ptr,
   916 ;    unsigned int    output_pitch,
   917 ;    unsigned int    output_height,
   918 ;    short *filter
   919 ;)
       ; 8-tap horizontal filter, 16 pixels wide. The body is "HORIZx16 0": the
       ; filtered rows are stored directly, without averaging with the
       ; destination. HORIZx16 stores with movdqa, so output_ptr must be
       ; 16-byte aligned on every row.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, RESTORE_GOT,
       ; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm (not shown
       ; in this file).
   920 global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
   921 sym(vp9_filter_block1d16_h8_ssse3):
   922     push        rbp
   923     mov         rbp, rsp
   924     SHADOW_ARGS_TO_STACK 6
   925     SAVE_XMM 7
   926     GET_GOT     rbx
   927     push        rsi
   928     push        rdi
   929     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the HORIZx16 operands
   931     ALIGN_STACK 16, rax
   932     sub         rsp, 16*5
   933     %define k0k1 [rsp + 16*0]
   934     %define k2k3 [rsp + 16*1]
   935     %define k4k5 [rsp + 16*2]
   936     %define k6k7 [rsp + 16*3]
   937     %define krd [rsp + 16*4]
   939     HORIZx16 0
   941     add rsp, 16*5                           ;release the scratch slots
   942     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   944     ; begin epilog
   945     pop rdi
   946     pop rsi
   947     RESTORE_GOT
   948     RESTORE_XMM
   949     UNSHADOW_ARGS
   950     pop         rbp
   951     ret
   953 global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
       ; 8-tap horizontal filter, 4 pixels wide, averaging variant. The body is
       ; "HORIZx4 1": each filtered row is averaged (pavgb) with the bytes
       ; already at the destination before being stored. The six arguments are
       ; read by HORIZx4 through arg(0)..arg(5): src_ptr, src_pixels_per_line,
       ; output_ptr, output_pitch, output_height, filter ptr.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, RESTORE_GOT,
       ; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm (not shown
       ; in this file).
   954 sym(vp9_filter_block1d4_h8_avg_ssse3):
   955     push        rbp
   956     mov         rbp, rsp
   957     SHADOW_ARGS_TO_STACK 6
   958     SAVE_XMM 7
   959     GET_GOT     rbx
   960     push        rsi
   961     push        rdi
   962     ; end prolog
       ;reserve three 16-byte scratch slots on the stack for the HORIZx4 operands
   964     ALIGN_STACK 16, rax
   965     sub         rsp, 16 * 3
   966     %define k0k1k4k5 [rsp + 16 * 0]
   967     %define k2k3k6k7 [rsp + 16 * 1]
   968     %define krd      [rsp + 16 * 2]
   970     HORIZx4 1
   972     add rsp, 16 * 3                         ;release the scratch slots
   973     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
   974     ; begin epilog
   975     pop rdi
   976     pop rsi
   977     RESTORE_GOT
   978     RESTORE_XMM
   979     UNSHADOW_ARGS
   980     pop         rbp
   981     ret
   983 global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
       ; 8-tap horizontal filter, 8 pixels wide, averaging variant. The body is
       ; "HORIZx8 1": each filtered row is averaged (pavgb) with the bytes
       ; already at the destination before being stored. The six arguments are
       ; read by HORIZx8 through arg(0)..arg(5): src_ptr, src_pixels_per_line,
       ; output_ptr, output_pitch, output_height, filter ptr.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, RESTORE_GOT,
       ; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm (not shown
       ; in this file).
   984 sym(vp9_filter_block1d8_h8_avg_ssse3):
   985     push        rbp
   986     mov         rbp, rsp
   987     SHADOW_ARGS_TO_STACK 6
   988     SAVE_XMM 7
   989     GET_GOT     rbx
   990     push        rsi
   991     push        rdi
   992     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the HORIZx8 operands
   994     ALIGN_STACK 16, rax
   995     sub         rsp, 16*5
   996     %define k0k1 [rsp + 16*0]
   997     %define k2k3 [rsp + 16*1]
   998     %define k4k5 [rsp + 16*2]
   999     %define k6k7 [rsp + 16*3]
  1000     %define krd [rsp + 16*4]
  1002     HORIZx8 1
  1004     add rsp, 16*5                           ;release the scratch slots
  1005     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
  1007     ; begin epilog
  1008     pop rdi
  1009     pop rsi
  1010     RESTORE_GOT
  1011     RESTORE_XMM
  1012     UNSHADOW_ARGS
  1013     pop         rbp
  1014     ret
  1016 global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
       ; 8-tap horizontal filter, 16 pixels wide, averaging variant. The body is
       ; "HORIZx16 1": each filtered row is averaged (pavgb) with the 16 bytes
       ; already at the destination before being stored. HORIZx16 reads and
       ; writes the destination with movdqa, so output_ptr must be 16-byte
       ; aligned on every row. The six arguments are read by HORIZx16 through
       ; arg(0)..arg(5): src_ptr, src_pixels_per_line, output_ptr, output_pitch,
       ; output_height, filter ptr.
       ; SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, RESTORE_GOT,
       ; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm (not shown
       ; in this file).
  1017 sym(vp9_filter_block1d16_h8_avg_ssse3):
  1018     push        rbp
  1019     mov         rbp, rsp
  1020     SHADOW_ARGS_TO_STACK 6
  1021     SAVE_XMM 7
  1022     GET_GOT     rbx
  1023     push        rsi
  1024     push        rdi
  1025     ; end prolog
       ;reserve five 16-byte scratch slots on the stack for the HORIZx16 operands
  1027     ALIGN_STACK 16, rax
  1028     sub         rsp, 16*5
  1029     %define k0k1 [rsp + 16*0]
  1030     %define k2k3 [rsp + 16*1]
  1031     %define k4k5 [rsp + 16*2]
  1032     %define k6k7 [rsp + 16*3]
  1033     %define krd [rsp + 16*4]
  1035     HORIZx16 1
  1037     add rsp, 16*5                           ;release the scratch slots
  1038     pop rsp                                 ;presumably the rsp value saved by ALIGN_STACK - confirm in x86_abi_support.asm
  1040     ; begin epilog
  1041     pop rdi
  1042     pop rsi
  1043     RESTORE_GOT
  1044     RESTORE_XMM
  1045     UNSHADOW_ARGS
  1046     pop         rbp
  1047     ret
  1048 SECTION_RODATA
       ; pshufb control tables used by the horizontal filters. Each table turns
       ; 16 source bytes into eight adjacent-byte pairs (i, i+1), one pair per
       ; output pixel, ready to be multiplied by one tap pair with pmaddubsw.
       ; The tables differ only in the starting byte offset: 0, 2, 4 and 6.
  1049 align 16
  1050 shuf_t0t1:
  1051     db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
  1052 align 16
  1053 shuf_t2t3:
  1054     db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
  1055 align 16
  1056 shuf_t4t5:
  1057     db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
  1058 align 16
  1059 shuf_t6t7:
  1060     db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14

mercurial