media/libvpx/vp9/common/x86/vp9_subpixel_8t_sse2.asm

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    14 ;Note: tap3 and tap4 have to be applied and added after other taps to avoid
    15 ;overflow.
    17 %macro GET_FILTERS_4 0
       ; Set up the constants used by APPLY_FILTER_4. Reads eight 16-bit taps
       ; from the pointer in arg(5) and stores them, each replicated four times
       ; and paired two taps per register, into the k0k1 / k2k3 / k5k4 / k6k7
       ; slots; also fills krd (rounding constant) and zero. Those six names
       ; are not defined here: every caller in this file %define's them as
       ; stack slots before invoking the macro.
       ; Clobbers: rdx, rcx, xmm0-xmm7.
    18     mov         rdx, arg(5)                 ;filter ptr
    19     mov         rcx, 0x0400040              ;64 in each of the two low words
    21     movdqa      xmm7, [rdx]                 ;load filters (aligned 16-byte load)
    22     pshuflw     xmm0, xmm7, 0b              ;k0 replicated in the low 4 words
    23     pshuflw     xmm1, xmm7, 01010101b       ;k1
    24     pshuflw     xmm2, xmm7, 10101010b       ;k2
    25     pshuflw     xmm3, xmm7, 11111111b       ;k3
    26     psrldq      xmm7, 8                     ;bring k4..k7 down to the low 4 words
    27     pshuflw     xmm4, xmm7, 0b              ;k4
    28     pshuflw     xmm5, xmm7, 01010101b       ;k5
    29     pshuflw     xmm6, xmm7, 10101010b       ;k6
    30     pshuflw     xmm7, xmm7, 11111111b       ;k7
    32     punpcklqdq  xmm0, xmm1                  ;low qword = k0 x4, high qword = k1 x4
    33     punpcklqdq  xmm2, xmm3                  ;k2 x4 | k3 x4
    34     punpcklqdq  xmm5, xmm4                  ;k5 x4 | k4 x4 (k5 in the low half)
    35     punpcklqdq  xmm6, xmm7                  ;k6 x4 | k7 x4
    37     movdqa      k0k1, xmm0                  ;store the paired taps in the caller's slots
    38     movdqa      k2k3, xmm2
    39     movdqa      k5k4, xmm5
    40     movdqa      k6k7, xmm6
    42     movq        xmm6, rcx
    43     pshufd      xmm6, xmm6, 0               ;broadcast low dword: every word = 64
    44     movdqa      krd, xmm6                   ;rounding term added before the >> 7
    46     pxor        xmm7, xmm7
    47     movdqa      zero, xmm7                  ;all-zero vector used for byte->word unpack
    48 %endm
    50 %macro APPLY_FILTER_4 1
       ; Filter 4 pixels with 8 taps and store 4 result bytes at [rdi].
       ; On entry xmmN holds, in its low 4 bytes, the source pixels that tap N
       ; applies to (N = 0..7). k0k1/k2k3/k5k4/k6k7, krd and zero are the
       ; caller-defined slots filled by GET_FILTERS_4.
       ; %1: when non-zero the result is averaged (pavgb) with the 4 bytes
       ;     already at [rdi] before being stored.
       ; In the comments below tN is the 4-word product "tap N * pixels".
       ; Only the low 4 words of the running sum are meaningful.
       ; Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
    51     punpckldq   xmm0, xmm1                  ;two rows in one register: low 8 bytes = tap0 | tap1 pixels
    52     punpckldq   xmm6, xmm7                  ;tap6 | tap7 pixels
    53     punpckldq   xmm2, xmm3                  ;tap2 | tap3 pixels
    54     punpckldq   xmm5, xmm4                  ;tap5 | tap4 pixels (matches the k5k4 order)
    56     punpcklbw   xmm0, zero                  ;unpack to word
    57     punpcklbw   xmm6, zero
    58     punpcklbw   xmm2, zero
    59     punpcklbw   xmm5, zero
    61     pmullw      xmm0, k0k1                  ;multiply the filter factors
    62     pmullw      xmm6, k6k7
    63     pmullw      xmm2, k2k3
    64     pmullw      xmm5, k5k4
    66     paddsw      xmm0, xmm6                  ;sum: low = t0+t6, high = t1+t7 (saturating)
    67     movdqa      xmm1, xmm0
    68     psrldq      xmm1, 8                     ;bring the t1+t7 half down
    69     paddsw      xmm0, xmm1                  ;low = t0+t6+t1+t7
    70     paddsw      xmm0, xmm2                  ;+ t2
    71     psrldq      xmm2, 8                     ;xmm2 low = t3
    72     paddsw      xmm0, xmm5                  ;+ t5
    73     psrldq      xmm5, 8                     ;xmm5 low = t4
    74     paddsw      xmm0, xmm2                  ;+ t3 (second to last)
    75     paddsw      xmm0, xmm5                  ;+ t4 (last)
    77     paddsw      xmm0, krd                   ;rounding
    78     psraw       xmm0, 7                     ;shift
    79     packuswb    xmm0, xmm0                  ;pack to byte (unsigned saturation)
    81 %if %1
    82     movd        xmm1, [rdi]                 ;current 4 destination bytes
    83     pavgb       xmm0, xmm1                  ;rounded byte average with the destination
    84 %endif
    85     movd        [rdi], xmm0                 ;store 4 output bytes
    86 %endm
    88 %macro GET_FILTERS 0
       ; Set up the constants used by APPLY_FILTER_8 and load the data
       ; pointers. Reads eight 16-bit taps from the pointer in arg(5) and
       ; stores each one, replicated across all 8 words, into k0..k7; also
       ; fills krd (rounding constant) and zero. k0..k7, krd and zero are not
       ; defined here: every caller in this file %define's them as stack slots
       ; before invoking the macro.
       ; On exit rsi = arg(0) and rdi = arg(2).
       ; Clobbers: rdx, rcx, rsi, rdi, xmm0-xmm7.
    89     mov         rdx, arg(5)                 ;filter ptr
    90     mov         rsi, arg(0)                 ;src_ptr
    91     mov         rdi, arg(2)                 ;output_ptr
    92     mov         rcx, 0x0400040              ;64 in each of the two low words
    94     movdqa      xmm7, [rdx]                 ;load filters (aligned 16-byte load)
    95     pshuflw     xmm0, xmm7, 0b              ;k0 replicated in the low 4 words
    96     pshuflw     xmm1, xmm7, 01010101b       ;k1
    97     pshuflw     xmm2, xmm7, 10101010b       ;k2
    98     pshuflw     xmm3, xmm7, 11111111b       ;k3
    99     pshufhw     xmm4, xmm7, 0b              ;k4 replicated in the high 4 words
   100     pshufhw     xmm5, xmm7, 01010101b       ;k5
   101     pshufhw     xmm6, xmm7, 10101010b       ;k6
   102     pshufhw     xmm7, xmm7, 11111111b       ;k7
   104     punpcklwd   xmm0, xmm0                  ;spread the low 4 copies over all 8 words
   105     punpcklwd   xmm1, xmm1
   106     punpcklwd   xmm2, xmm2
   107     punpcklwd   xmm3, xmm3
   108     punpckhwd   xmm4, xmm4                  ;spread the high 4 copies over all 8 words
   109     punpckhwd   xmm5, xmm5
   110     punpckhwd   xmm6, xmm6
   111     punpckhwd   xmm7, xmm7
   113     movdqa      k0,   xmm0                  ;store filter factors on stack
   114     movdqa      k1,   xmm1
   115     movdqa      k2,   xmm2
   116     movdqa      k3,   xmm3
   117     movdqa      k4,   xmm4
   118     movdqa      k5,   xmm5
   119     movdqa      k6,   xmm6
   120     movdqa      k7,   xmm7
   122     movq        xmm6, rcx
   123     pshufd      xmm6, xmm6, 0               ;broadcast low dword: every word = 64
   124     movdqa      krd, xmm6                   ;rounding
   126     pxor        xmm7, xmm7
   127     movdqa      zero, xmm7                  ;all-zero vector used for byte->word unpack
   128 %endm
   130 %macro LOAD_VERT_8 1
       ; Load 8 bytes from each of 8 consecutive source rows into xmm0..xmm7
       ; (xmmN = row N), for the vertical filters.
       ; %1: byte offset added to every row address.
       ; Expects rsi = address of row 0, rax = source row stride and
       ; rdx = 3 * rax (the callers in this file set rdx with
       ; lea rdx, [rax + rax * 2]); the row numbers in the comments below only
       ; hold under that assumption.
       ; Side effect: rsi is advanced by one row (rax).
   131     movq        xmm0, [rsi + %1]            ;0
   132     movq        xmm1, [rsi + rax + %1]      ;1
   133     movq        xmm6, [rsi + rdx * 2 + %1]  ;6
   134     lea         rsi,  [rsi + rax]           ;rsi now points at row 1
   135     movq        xmm7, [rsi + rdx * 2 + %1]  ;7
   136     movq        xmm2, [rsi + rax + %1]      ;2
   137     movq        xmm3, [rsi + rax * 2 + %1]  ;3
   138     movq        xmm4, [rsi + rdx + %1]      ;4
   139     movq        xmm5, [rsi + rax * 4 + %1]  ;5
   140 %endm
   142 %macro APPLY_FILTER_8 2
       ; Filter 8 pixels with 8 taps and store 8 result bytes at [rdi + %2].
       ; On entry xmmN holds, in its low 8 bytes, the source pixels that tap N
       ; applies to (N = 0..7). k0..k7, krd and zero are the caller-defined
       ; slots filled by GET_FILTERS.
       ; %1: when non-zero the result is averaged (pavgb) with the 8 bytes
       ;     already at [rdi + %2] before being stored.
       ; %2: byte offset into the output row.
       ; Clobbers: xmm0-xmm7.
   143     punpcklbw   xmm0, zero                  ;unpack the low 8 bytes to words
   144     punpcklbw   xmm1, zero
   145     punpcklbw   xmm6, zero
   146     punpcklbw   xmm7, zero
   147     punpcklbw   xmm2, zero
   148     punpcklbw   xmm5, zero
   149     punpcklbw   xmm3, zero
   150     punpcklbw   xmm4, zero
   152     pmullw      xmm0, k0                    ;multiply each row by its tap
   153     pmullw      xmm1, k1
   154     pmullw      xmm6, k6
   155     pmullw      xmm7, k7
   156     pmullw      xmm2, k2
   157     pmullw      xmm5, k5
   158     pmullw      xmm3, k3
   159     pmullw      xmm4, k4
   161     paddsw      xmm0, xmm1                  ;saturating sum, order 0,1,6,7,2,5,3,4
   162     paddsw      xmm0, xmm6
   163     paddsw      xmm0, xmm7
   164     paddsw      xmm0, xmm2
   165     paddsw      xmm0, xmm5
   166     paddsw      xmm0, xmm3                  ;tap 3 second to last
   167     paddsw      xmm0, xmm4                  ;tap 4 last
   169     paddsw      xmm0, krd                   ;rounding
   170     psraw       xmm0, 7                     ;shift
   171     packuswb    xmm0, xmm0                  ;pack back to byte
   172 %if %1
   173     movq        xmm1, [rdi + %2]            ;current 8 destination bytes
   174     pavgb       xmm0, xmm1                  ;rounded byte average with the destination
   175 %endif
   176     movq        [rdi + %2], xmm0            ;store 8 output bytes
   177 %endm
   179 ;void vp9_filter_block1d4_v8_sse2
   180 ;(
   181 ;    unsigned char *src_ptr,
   182 ;    unsigned int   src_pitch,
   183 ;    unsigned char *output_ptr,
   184 ;    unsigned int   out_pitch,
   185 ;    unsigned int   output_height,
   186 ;    short *filter
   187 ;)
   188 global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
       ; Vertical 8-tap filter, 4 pixels per row, plain store (APPLY_FILTER_4 0).
       ; Arguments are read through arg(n): 0 src_ptr, 1 src_pitch,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   189 sym(vp9_filter_block1d4_v8_sse2):
   190     push        rbp
   191     mov         rbp, rsp
   192     SHADOW_ARGS_TO_STACK 6
   193     SAVE_XMM 7
   194     push        rsi
   195     push        rdi
   196     push        rbx
   197     ; end prolog
   199     ALIGN_STACK 16, rax
   200     sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
   201     %define k0k1 [rsp + 16 * 0]
   202     %define k2k3 [rsp + 16 * 1]
   203     %define k5k4 [rsp + 16 * 2]
   204     %define k6k7 [rsp + 16 * 3]
   205     %define krd [rsp + 16 * 4]
   206     %define zero [rsp + 16 * 5]
           ; fill the six slots above (clobbers rdx and rcx, reloaded below)
   208     GET_FILTERS_4
   210     mov         rsi, arg(0)                 ;src_ptr
   211     mov         rdi, arg(2)                 ;output_ptr
   213     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   214     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
   215     lea         rdx, [rax + rax * 2]        ;rdx = 3 * pixels_per_line
   216     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   218 .loop:
   219     movd        xmm0, [rsi]                 ;load src: row 0
   220     movd        xmm1, [rsi + rax]           ;1
   221     movd        xmm6, [rsi + rdx * 2]       ;6
   222     lea         rsi,  [rsi + rax]           ;advance src one row; loads below are relative to row 1
   223     movd        xmm7, [rsi + rdx * 2]       ;7
   224     movd        xmm2, [rsi + rax]           ;2
   225     movd        xmm3, [rsi + rax * 2]       ;3
   226     movd        xmm4, [rsi + rdx]           ;4
   227     movd        xmm5, [rsi + rax * 4]       ;5
           ; filter the 8 rows and store 4 bytes at [rdi]
   229     APPLY_FILTER_4 0
   231     lea         rdi, [rdi + rbx]            ;next output row
   232     dec         rcx
   233     jnz         .loop
   235     add rsp, 16 * 6                         ;release the scratch slots
   236     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   237     pop rbx
   238     ; begin epilog
   239     pop rdi
   240     pop rsi
   241     RESTORE_XMM
   242     UNSHADOW_ARGS
   243     pop         rbp
   244     ret
   246 ;void vp9_filter_block1d8_v8_sse2
   247 ;(
   248 ;    unsigned char *src_ptr,
   249 ;    unsigned int   src_pitch,
   250 ;    unsigned char *output_ptr,
   251 ;    unsigned int   out_pitch,
   252 ;    unsigned int   output_height,
   253 ;    short *filter
   254 ;)
   255 global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
       ; Vertical 8-tap filter, 8 pixels per row, plain store
       ; (APPLY_FILTER_8 0, 0).
       ; Arguments are read through arg(n): 0 src_ptr, 1 src_pitch,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   256 sym(vp9_filter_block1d8_v8_sse2):
   257     push        rbp
   258     mov         rbp, rsp
   259     SHADOW_ARGS_TO_STACK 6
   260     SAVE_XMM 7
   261     push        rsi
   262     push        rdi
   263     push        rbx
   264     ; end prolog
   266     ALIGN_STACK 16, rax
   267     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   268     %define k0 [rsp + 16 * 0]
   269     %define k1 [rsp + 16 * 1]
   270     %define k2 [rsp + 16 * 2]
   271     %define k3 [rsp + 16 * 3]
   272     %define k4 [rsp + 16 * 4]
   273     %define k5 [rsp + 16 * 5]
   274     %define k6 [rsp + 16 * 6]
   275     %define k7 [rsp + 16 * 7]
   276     %define krd [rsp + 16 * 8]
   277     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   279     GET_FILTERS
   281     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   282     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
   283     lea         rdx, [rax + rax * 2]        ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
   284     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   286 .loop:
           ; load 8 rows (advances rsi by one row), filter, store 8 bytes at [rdi]
   287     LOAD_VERT_8 0
   288     APPLY_FILTER_8 0, 0
   290     lea         rdi, [rdi + rbx]            ;next output row
   291     dec         rcx
   292     jnz         .loop
   294     add rsp, 16 * 10                        ;release the scratch slots
   295     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   296     pop rbx
   297     ; begin epilog
   298     pop rdi
   299     pop rsi
   300     RESTORE_XMM
   301     UNSHADOW_ARGS
   302     pop         rbp
   303     ret
   305 ;void vp9_filter_block1d16_v8_sse2
   306 ;(
   307 ;    unsigned char *src_ptr,
   308 ;    unsigned int   src_pitch,
   309 ;    unsigned char *output_ptr,
   310 ;    unsigned int   out_pitch,
   311 ;    unsigned int   output_height,
   312 ;    short *filter
   313 ;)
   314 global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
       ; Vertical 8-tap filter, 16 pixels per row, plain store. Each output row
       ; is produced as two 8-pixel halves (byte offsets 0 and 8).
       ; Arguments are read through arg(n): 0 src_ptr, 1 src_pitch,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   315 sym(vp9_filter_block1d16_v8_sse2):
   316     push        rbp
   317     mov         rbp, rsp
   318     SHADOW_ARGS_TO_STACK 6
   319     SAVE_XMM 7
   320     push        rsi
   321     push        rdi
   322     push        rbx
   323     ; end prolog
   325     ALIGN_STACK 16, rax
   326     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   327     %define k0 [rsp + 16 * 0]
   328     %define k1 [rsp + 16 * 1]
   329     %define k2 [rsp + 16 * 2]
   330     %define k3 [rsp + 16 * 3]
   331     %define k4 [rsp + 16 * 4]
   332     %define k5 [rsp + 16 * 5]
   333     %define k6 [rsp + 16 * 6]
   334     %define k7 [rsp + 16 * 7]
   335     %define krd [rsp + 16 * 8]
   336     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   338     GET_FILTERS
   340     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   341     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
   342     lea         rdx, [rax + rax * 2]        ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
   343     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   345 .loop:
           ; left half: pixels 0..7
   346     LOAD_VERT_8 0
   347     APPLY_FILTER_8 0, 0
   348     sub         rsi, rax                    ;undo the row advance made by LOAD_VERT_8
           ; right half: pixels 8..15 of the same rows; rsi ends one row further on
   350     LOAD_VERT_8 8
   351     APPLY_FILTER_8 0, 8
   352     add         rdi, rbx                    ;next output row
   354     dec         rcx
   355     jnz         .loop
   357     add rsp, 16 * 10                        ;release the scratch slots
   358     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   359     pop rbx
   360     ; begin epilog
   361     pop rdi
   362     pop rsi
   363     RESTORE_XMM
   364     UNSHADOW_ARGS
   365     pop         rbp
   366     ret
   368 global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
       ; Vertical 8-tap filter, 4 pixels per row, averaging variant: the
       ; filtered bytes are averaged with the bytes already in the output
       ; (APPLY_FILTER_4 1).
       ; Arguments are read through arg(n): 0 src_ptr, 1 pixels_per_line,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   369 sym(vp9_filter_block1d4_v8_avg_sse2):
   370     push        rbp
   371     mov         rbp, rsp
   372     SHADOW_ARGS_TO_STACK 6
   373     SAVE_XMM 7
   374     push        rsi
   375     push        rdi
   376     push        rbx
   377     ; end prolog
   379     ALIGN_STACK 16, rax
   380     sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
   381     %define k0k1 [rsp + 16 * 0]
   382     %define k2k3 [rsp + 16 * 1]
   383     %define k5k4 [rsp + 16 * 2]
   384     %define k6k7 [rsp + 16 * 3]
   385     %define krd [rsp + 16 * 4]
   386     %define zero [rsp + 16 * 5]
           ; fill the six slots above (clobbers rdx and rcx, reloaded below)
   388     GET_FILTERS_4
   390     mov         rsi, arg(0)                 ;src_ptr
   391     mov         rdi, arg(2)                 ;output_ptr
   393     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   394     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
   395     lea         rdx, [rax + rax * 2]        ;rdx = 3 * pixels_per_line
   396     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   398 .loop:
   399     movd        xmm0, [rsi]                 ;load src: row 0
   400     movd        xmm1, [rsi + rax]           ;1
   401     movd        xmm6, [rsi + rdx * 2]       ;6
   402     lea         rsi,  [rsi + rax]           ;advance src one row; loads below are relative to row 1
   403     movd        xmm7, [rsi + rdx * 2]       ;7
   404     movd        xmm2, [rsi + rax]           ;2
   405     movd        xmm3, [rsi + rax * 2]       ;3
   406     movd        xmm4, [rsi + rdx]           ;4
   407     movd        xmm5, [rsi + rax * 4]       ;5
           ; filter the 8 rows, average with [rdi] and store 4 bytes
   409     APPLY_FILTER_4 1
   411     lea         rdi, [rdi + rbx]            ;next output row
   412     dec         rcx
   413     jnz         .loop
   415     add rsp, 16 * 6                         ;release the scratch slots
   416     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   417     pop rbx
   418     ; begin epilog
   419     pop rdi
   420     pop rsi
   421     RESTORE_XMM
   422     UNSHADOW_ARGS
   423     pop         rbp
   424     ret
   426 global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
       ; Vertical 8-tap filter, 8 pixels per row, averaging variant: the
       ; filtered bytes are averaged with the bytes already in the output
       ; (APPLY_FILTER_8 1, 0).
       ; Arguments are read through arg(n): 0 src_ptr, 1 pixels_per_line,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   427 sym(vp9_filter_block1d8_v8_avg_sse2):
   428     push        rbp
   429     mov         rbp, rsp
   430     SHADOW_ARGS_TO_STACK 6
   431     SAVE_XMM 7
   432     push        rsi
   433     push        rdi
   434     push        rbx
   435     ; end prolog
   437     ALIGN_STACK 16, rax
   438     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   439     %define k0 [rsp + 16 * 0]
   440     %define k1 [rsp + 16 * 1]
   441     %define k2 [rsp + 16 * 2]
   442     %define k3 [rsp + 16 * 3]
   443     %define k4 [rsp + 16 * 4]
   444     %define k5 [rsp + 16 * 5]
   445     %define k6 [rsp + 16 * 6]
   446     %define k7 [rsp + 16 * 7]
   447     %define krd [rsp + 16 * 8]
   448     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   450     GET_FILTERS
   452     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   453     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
   454     lea         rdx, [rax + rax * 2]        ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
   455     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   456 .loop:
           ; load 8 rows (advances rsi by one row), filter, average with [rdi], store
   457     LOAD_VERT_8 0
   458     APPLY_FILTER_8 1, 0
   460     lea         rdi, [rdi + rbx]            ;next output row
   461     dec         rcx
   462     jnz         .loop
   464     add rsp, 16 * 10                        ;release the scratch slots
   465     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   466     pop rbx
   467     ; begin epilog
   468     pop rdi
   469     pop rsi
   470     RESTORE_XMM
   471     UNSHADOW_ARGS
   472     pop         rbp
   473     ret
   475 global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
       ; Vertical 8-tap filter, 16 pixels per row, averaging variant. Each
       ; output row is produced as two 8-pixel halves (byte offsets 0 and 8),
       ; each averaged with the bytes already in the output.
       ; Arguments are read through arg(n): 0 src_ptr, 1 pixels_per_line,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   476 sym(vp9_filter_block1d16_v8_avg_sse2):
   477     push        rbp
   478     mov         rbp, rsp
   479     SHADOW_ARGS_TO_STACK 6
   480     SAVE_XMM 7
   481     push        rsi
   482     push        rdi
   483     push        rbx
   484     ; end prolog
   486     ALIGN_STACK 16, rax
   487     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   488     %define k0 [rsp + 16 * 0]
   489     %define k1 [rsp + 16 * 1]
   490     %define k2 [rsp + 16 * 2]
   491     %define k3 [rsp + 16 * 3]
   492     %define k4 [rsp + 16 * 4]
   493     %define k5 [rsp + 16 * 5]
   494     %define k6 [rsp + 16 * 6]
   495     %define k7 [rsp + 16 * 7]
   496     %define krd [rsp + 16 * 8]
   497     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   499     GET_FILTERS
   501     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   502     movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
   503     lea         rdx, [rax + rax * 2]        ;rdx = 3 * pixels_per_line, as LOAD_VERT_8 expects
   504     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   505 .loop:
           ; left half: pixels 0..7
   506     LOAD_VERT_8 0
   507     APPLY_FILTER_8 1, 0
   508     sub         rsi, rax                    ;undo the row advance made by LOAD_VERT_8
           ; right half: pixels 8..15 of the same rows; rsi ends one row further on
   510     LOAD_VERT_8 8
   511     APPLY_FILTER_8 1, 8
   512     add         rdi, rbx                    ;next output row
   514     dec         rcx
   515     jnz         .loop
   517     add rsp, 16 * 10                        ;release the scratch slots
   518     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   519     pop rbx
   520     ; begin epilog
   521     pop rdi
   522     pop rsi
   523     RESTORE_XMM
   524     UNSHADOW_ARGS
   525     pop         rbp
   526     ret
   528 ;void vp9_filter_block1d4_h8_sse2
   529 ;(
   530 ;    unsigned char  *src_ptr,
   531 ;    unsigned int    src_pixels_per_line,
   532 ;    unsigned char  *output_ptr,
   533 ;    unsigned int    output_pitch,
   534 ;    unsigned int    output_height,
   535 ;    short *filter
   536 ;)
   537 global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
       ; Horizontal 8-tap filter, 4 pixels per row, plain store
       ; (APPLY_FILTER_4 0).
       ; Arguments are read through arg(n): 0 src_ptr, 1 src_pixels_per_line,
       ; 2 output_ptr, 3 output_pitch, 4 output_height, 5 filter.
       ; Each row is fetched with one unaligned 16-byte load starting at
       ; src - 3. The row loop is do-while shaped (dec/jnz): output_height
       ; must be >= 1.
   538 sym(vp9_filter_block1d4_h8_sse2):
   539     push        rbp
   540     mov         rbp, rsp
   541     SHADOW_ARGS_TO_STACK 6
   542     SAVE_XMM 7
   543     push        rsi
   544     push        rdi
   545     ; end prolog
   547     ALIGN_STACK 16, rax
   548     sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
   549     %define k0k1 [rsp + 16 * 0]
   550     %define k2k3 [rsp + 16 * 1]
   551     %define k5k4 [rsp + 16 * 2]
   552     %define k6k7 [rsp + 16 * 3]
   553     %define krd [rsp + 16 * 4]
   554     %define zero [rsp + 16 * 5]
           ; fill the six slots above (clobbers rdx and rcx, reloaded below)
   556     GET_FILTERS_4
   558     mov         rsi, arg(0)                 ;src_ptr
   559     mov         rdi, arg(2)                 ;output_ptr
   561     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   562     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
   563     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   565 .loop:
   566     movdqu      xmm0,   [rsi - 3]           ;load src
   568     movdqa      xmm1, xmm0                  ;one copy of the row per tap
   569     movdqa      xmm6, xmm0
   570     movdqa      xmm7, xmm0
   571     movdqa      xmm2, xmm0
   572     movdqa      xmm3, xmm0
   573     movdqa      xmm5, xmm0
   574     movdqa      xmm4, xmm0
   576     psrldq      xmm1, 1                     ;shift copy N right by N bytes so tap N's pixels start at byte 0
   577     psrldq      xmm6, 6
   578     psrldq      xmm7, 7
   579     psrldq      xmm2, 2
   580     psrldq      xmm3, 3
   581     psrldq      xmm5, 5
   582     psrldq      xmm4, 4
           ; filter and store 4 bytes at [rdi]
   584     APPLY_FILTER_4 0
   586     lea         rsi, [rsi + rax]            ;next source row
   587     lea         rdi, [rdi + rdx]            ;next output row
   588     dec         rcx
   589     jnz         .loop
   591     add rsp, 16 * 6                         ;release the scratch slots
   592     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   594     ; begin epilog
   595     pop rdi
   596     pop rsi
   597     RESTORE_XMM
   598     UNSHADOW_ARGS
   599     pop         rbp
   600     ret
   602 ;void vp9_filter_block1d8_h8_sse2
   603 ;(
   604 ;    unsigned char  *src_ptr,
   605 ;    unsigned int    src_pixels_per_line,
   606 ;    unsigned char  *output_ptr,
   607 ;    unsigned int    output_pitch,
   608 ;    unsigned int    output_height,
   609 ;    short *filter
   610 ;)
   611 global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
       ; Horizontal 8-tap filter, 8 pixels per row, plain store
       ; (APPLY_FILTER_8 0, 0).
       ; Arguments are read through arg(n): 0 src_ptr, 1 src_pixels_per_line,
       ; 2 output_ptr, 3 output_pitch, 4 output_height, 5 filter.
       ; Each row is fetched with one unaligned 16-byte load starting at
       ; src - 3. The row loop is do-while shaped (dec/jnz): output_height
       ; must be >= 1.
   612 sym(vp9_filter_block1d8_h8_sse2):
   613     push        rbp
   614     mov         rbp, rsp
   615     SHADOW_ARGS_TO_STACK 6
   616     SAVE_XMM 7
   617     push        rsi
   618     push        rdi
   619     ; end prolog
   621     ALIGN_STACK 16, rax
   622     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   623     %define k0 [rsp + 16 * 0]
   624     %define k1 [rsp + 16 * 1]
   625     %define k2 [rsp + 16 * 2]
   626     %define k3 [rsp + 16 * 3]
   627     %define k4 [rsp + 16 * 4]
   628     %define k5 [rsp + 16 * 5]
   629     %define k6 [rsp + 16 * 6]
   630     %define k7 [rsp + 16 * 7]
   631     %define krd [rsp + 16 * 8]
   632     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   634     GET_FILTERS
   636     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   637     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
   638     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   640 .loop:
   641     movdqu      xmm0,   [rsi - 3]           ;load src
   643     movdqa      xmm1, xmm0                  ;one copy of the row per tap
   644     movdqa      xmm6, xmm0
   645     movdqa      xmm7, xmm0
   646     movdqa      xmm2, xmm0
   647     movdqa      xmm5, xmm0
   648     movdqa      xmm3, xmm0
   649     movdqa      xmm4, xmm0
   651     psrldq      xmm1, 1                     ;shift copy N right by N bytes so tap N's pixels start at byte 0
   652     psrldq      xmm6, 6
   653     psrldq      xmm7, 7
   654     psrldq      xmm2, 2
   655     psrldq      xmm5, 5
   656     psrldq      xmm3, 3
   657     psrldq      xmm4, 4
           ; filter and store 8 bytes at [rdi]
   659     APPLY_FILTER_8 0, 0
   661     lea         rsi, [rsi + rax]            ;next source row
   662     lea         rdi, [rdi + rdx]            ;next output row
   663     dec         rcx
   664     jnz         .loop
   666     add rsp, 16 * 10                        ;release the scratch slots
   667     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   669     ; begin epilog
   670     pop rdi
   671     pop rsi
   672     RESTORE_XMM
   673     UNSHADOW_ARGS
   674     pop         rbp
   675     ret
   677 ;void vp9_filter_block1d16_h8_sse2
   678 ;(
   679 ;    unsigned char  *src_ptr,
   680 ;    unsigned int    src_pixels_per_line,
   681 ;    unsigned char  *output_ptr,
   682 ;    unsigned int    output_pitch,
   683 ;    unsigned int    output_height,
   684 ;    short *filter
   685 ;)
   686 global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
       ; Horizontal 8-tap filter, 16 pixels per row, plain store. Each output
       ; row is produced as two 8-pixel halves: the first from a 16-byte load
       ; at src - 3 (stored at offset 0), the second from a 16-byte load at
       ; src + 5 (stored at offset 8).
       ; Arguments are read through arg(n): 0 src_ptr, 1 src_pixels_per_line,
       ; 2 output_ptr, 3 output_pitch, 4 output_height, 5 filter.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   687 sym(vp9_filter_block1d16_h8_sse2):
   688     push        rbp
   689     mov         rbp, rsp
   690     SHADOW_ARGS_TO_STACK 6
   691     SAVE_XMM 7
   692     push        rsi
   693     push        rdi
   694     ; end prolog
   696     ALIGN_STACK 16, rax
   697     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   698     %define k0 [rsp + 16 * 0]
   699     %define k1 [rsp + 16 * 1]
   700     %define k2 [rsp + 16 * 2]
   701     %define k3 [rsp + 16 * 3]
   702     %define k4 [rsp + 16 * 4]
   703     %define k5 [rsp + 16 * 5]
   704     %define k6 [rsp + 16 * 6]
   705     %define k7 [rsp + 16 * 7]
   706     %define krd [rsp + 16 * 8]
   707     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   709     GET_FILTERS
   711     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   712     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
   713     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   715 .loop:
   716     movdqu      xmm0,   [rsi - 3]           ;load src (first half)
   718     movdqa      xmm1, xmm0                  ;one copy of the row per tap
   719     movdqa      xmm6, xmm0
   720     movdqa      xmm7, xmm0
   721     movdqa      xmm2, xmm0
   722     movdqa      xmm5, xmm0
   723     movdqa      xmm3, xmm0
   724     movdqa      xmm4, xmm0
   726     psrldq      xmm1, 1                     ;shift copy N right by N bytes so tap N's pixels start at byte 0
   727     psrldq      xmm6, 6
   728     psrldq      xmm7, 7
   729     psrldq      xmm2, 2
   730     psrldq      xmm5, 5
   731     psrldq      xmm3, 3
   732     psrldq      xmm4, 4
           ; filter and store output pixels 0..7
   734     APPLY_FILTER_8 0, 0
   736     movdqu      xmm0,   [rsi + 5]           ;load src (second half: 8 bytes past the first load)
   738     movdqa      xmm1, xmm0
   739     movdqa      xmm6, xmm0
   740     movdqa      xmm7, xmm0
   741     movdqa      xmm2, xmm0
   742     movdqa      xmm5, xmm0
   743     movdqa      xmm3, xmm0
   744     movdqa      xmm4, xmm0
   746     psrldq      xmm1, 1
   747     psrldq      xmm6, 6
   748     psrldq      xmm7, 7
   749     psrldq      xmm2, 2
   750     psrldq      xmm5, 5
   751     psrldq      xmm3, 3
   752     psrldq      xmm4, 4
           ; filter and store output pixels 8..15
   754     APPLY_FILTER_8 0, 8
   756     lea         rsi, [rsi + rax]            ;next source row
   757     lea         rdi, [rdi + rdx]            ;next output row
   758     dec         rcx
   759     jnz         .loop
   761     add rsp, 16 * 10                        ;release the scratch slots
   762     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   764     ; begin epilog
   765     pop rdi
   766     pop rsi
   767     RESTORE_XMM
   768     UNSHADOW_ARGS
   769     pop         rbp
   770     ret
   772 global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
       ; Horizontal 8-tap filter, 4 pixels per row, averaging variant: the
       ; filtered bytes are averaged with the bytes already in the output
       ; (APPLY_FILTER_4 1).
       ; Arguments are read through arg(n): 0 src_ptr, 1 pixels_per_line,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr.
       ; Each row is fetched with one unaligned 16-byte load starting at
       ; src - 3. The row loop is do-while shaped (dec/jnz): output_height
       ; must be >= 1.
   773 sym(vp9_filter_block1d4_h8_avg_sse2):
   774     push        rbp
   775     mov         rbp, rsp
   776     SHADOW_ARGS_TO_STACK 6
   777     SAVE_XMM 7
   778     push        rsi
   779     push        rdi
   780     ; end prolog
   782     ALIGN_STACK 16, rax
   783     sub         rsp, 16 * 6                 ;six 16-byte scratch slots, named below
   784     %define k0k1 [rsp + 16 * 0]
   785     %define k2k3 [rsp + 16 * 1]
   786     %define k5k4 [rsp + 16 * 2]
   787     %define k6k7 [rsp + 16 * 3]
   788     %define krd [rsp + 16 * 4]
   789     %define zero [rsp + 16 * 5]
           ; fill the six slots above (clobbers rdx and rcx, reloaded below)
   791     GET_FILTERS_4
   793     mov         rsi, arg(0)                 ;src_ptr
   794     mov         rdi, arg(2)                 ;output_ptr
   796     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   797     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
   798     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   800 .loop:
   801     movdqu      xmm0,   [rsi - 3]           ;load src
   803     movdqa      xmm1, xmm0                  ;one copy of the row per tap
   804     movdqa      xmm6, xmm0
   805     movdqa      xmm7, xmm0
   806     movdqa      xmm2, xmm0
   807     movdqa      xmm3, xmm0
   808     movdqa      xmm5, xmm0
   809     movdqa      xmm4, xmm0
   811     psrldq      xmm1, 1                     ;shift copy N right by N bytes so tap N's pixels start at byte 0
   812     psrldq      xmm6, 6
   813     psrldq      xmm7, 7
   814     psrldq      xmm2, 2
   815     psrldq      xmm3, 3
   816     psrldq      xmm5, 5
   817     psrldq      xmm4, 4
           ; filter, average with [rdi] and store 4 bytes
   819     APPLY_FILTER_4 1
   821     lea         rsi, [rsi + rax]            ;next source row
   822     lea         rdi, [rdi + rdx]            ;next output row
   823     dec         rcx
   824     jnz         .loop
   826     add rsp, 16 * 6                         ;release the scratch slots
   827     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   829     ; begin epilog
   830     pop rdi
   831     pop rsi
   832     RESTORE_XMM
   833     UNSHADOW_ARGS
   834     pop         rbp
   835     ret
   837 global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
       ; Horizontal 8-tap filter, 8 pixels per row, averaging variant: the
       ; filtered bytes are averaged with the bytes already in the output
       ; (APPLY_FILTER_8 1, 0).
       ; Arguments are read through arg(n): 0 src_ptr, 1 pixels_per_line,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr.
       ; Each row is fetched with one unaligned 16-byte load starting at
       ; src - 3. The row loop is do-while shaped (dec/jnz): output_height
       ; must be >= 1.
   838 sym(vp9_filter_block1d8_h8_avg_sse2):
   839     push        rbp
   840     mov         rbp, rsp
   841     SHADOW_ARGS_TO_STACK 6
   842     SAVE_XMM 7
   843     push        rsi
   844     push        rdi
   845     ; end prolog
   847     ALIGN_STACK 16, rax
   848     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   849     %define k0 [rsp + 16 * 0]
   850     %define k1 [rsp + 16 * 1]
   851     %define k2 [rsp + 16 * 2]
   852     %define k3 [rsp + 16 * 3]
   853     %define k4 [rsp + 16 * 4]
   854     %define k5 [rsp + 16 * 5]
   855     %define k6 [rsp + 16 * 6]
   856     %define k7 [rsp + 16 * 7]
   857     %define krd [rsp + 16 * 8]
   858     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   860     GET_FILTERS
   862     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   863     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
   864     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   866 .loop:
   867     movdqu      xmm0,   [rsi - 3]           ;load src
   869     movdqa      xmm1, xmm0                  ;one copy of the row per tap
   870     movdqa      xmm6, xmm0
   871     movdqa      xmm7, xmm0
   872     movdqa      xmm2, xmm0
   873     movdqa      xmm5, xmm0
   874     movdqa      xmm3, xmm0
   875     movdqa      xmm4, xmm0
   877     psrldq      xmm1, 1                     ;shift copy N right by N bytes so tap N's pixels start at byte 0
   878     psrldq      xmm6, 6
   879     psrldq      xmm7, 7
   880     psrldq      xmm2, 2
   881     psrldq      xmm5, 5
   882     psrldq      xmm3, 3
   883     psrldq      xmm4, 4
           ; filter, average with [rdi] and store 8 bytes
   885     APPLY_FILTER_8 1, 0
   887     lea         rsi, [rsi + rax]            ;next source row
   888     lea         rdi, [rdi + rdx]            ;next output row
   889     dec         rcx
   890     jnz         .loop
   892     add rsp, 16 * 10                        ;release the scratch slots
   893     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   895     ; begin epilog
   896     pop rdi
   897     pop rsi
   898     RESTORE_XMM
   899     UNSHADOW_ARGS
   900     pop         rbp
   901     ret
   903 global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
       ; Horizontal 8-tap filter, 16 pixels per row, averaging variant. Each
       ; output row is produced as two 8-pixel halves: the first from a
       ; 16-byte load at src - 3 (offset 0), the second from a 16-byte load at
       ; src + 5 (offset 8); each half is averaged with the bytes already in
       ; the output.
       ; Arguments are read through arg(n): 0 src_ptr, 1 pixels_per_line,
       ; 2 output_ptr, 3 out_pitch, 4 output_height, 5 filter ptr.
       ; The row loop is do-while shaped (dec/jnz): output_height must be >= 1.
   904 sym(vp9_filter_block1d16_h8_avg_sse2):
   905     push        rbp
   906     mov         rbp, rsp
   907     SHADOW_ARGS_TO_STACK 6
   908     SAVE_XMM 7
   909     push        rsi
   910     push        rdi
   911     ; end prolog
   913     ALIGN_STACK 16, rax
   914     sub         rsp, 16 * 10                ;ten 16-byte scratch slots, named below
   915     %define k0 [rsp + 16 * 0]
   916     %define k1 [rsp + 16 * 1]
   917     %define k2 [rsp + 16 * 2]
   918     %define k3 [rsp + 16 * 3]
   919     %define k4 [rsp + 16 * 4]
   920     %define k5 [rsp + 16 * 5]
   921     %define k6 [rsp + 16 * 6]
   922     %define k7 [rsp + 16 * 7]
   923     %define krd [rsp + 16 * 8]
   924     %define zero [rsp + 16 * 9]
           ; fill the slots above; also loads rsi = src_ptr and rdi = output_ptr
   926     GET_FILTERS
   928     movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
   929     movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
   930     movsxd      rcx, DWORD PTR arg(4)       ;output_height
   932 .loop:
   933     movdqu      xmm0,   [rsi - 3]           ;load src (first half)
   935     movdqa      xmm1, xmm0                  ;one copy of the row per tap
   936     movdqa      xmm6, xmm0
   937     movdqa      xmm7, xmm0
   938     movdqa      xmm2, xmm0
   939     movdqa      xmm5, xmm0
   940     movdqa      xmm3, xmm0
   941     movdqa      xmm4, xmm0
   943     psrldq      xmm1, 1                     ;shift copy N right by N bytes so tap N's pixels start at byte 0
   944     psrldq      xmm6, 6
   945     psrldq      xmm7, 7
   946     psrldq      xmm2, 2
   947     psrldq      xmm5, 5
   948     psrldq      xmm3, 3
   949     psrldq      xmm4, 4
           ; filter, average and store output pixels 0..7
   951     APPLY_FILTER_8 1, 0
   953     movdqu      xmm0,   [rsi + 5]           ;load src (second half: 8 bytes past the first load)
   955     movdqa      xmm1, xmm0
   956     movdqa      xmm6, xmm0
   957     movdqa      xmm7, xmm0
   958     movdqa      xmm2, xmm0
   959     movdqa      xmm5, xmm0
   960     movdqa      xmm3, xmm0
   961     movdqa      xmm4, xmm0
   963     psrldq      xmm1, 1
   964     psrldq      xmm6, 6
   965     psrldq      xmm7, 7
   966     psrldq      xmm2, 2
   967     psrldq      xmm5, 5
   968     psrldq      xmm3, 3
   969     psrldq      xmm4, 4
           ; filter, average and store output pixels 8..15
   971     APPLY_FILTER_8 1, 8
   973     lea         rsi, [rsi + rax]            ;next source row
   974     lea         rdi, [rdi + rdx]            ;next output row
   975     dec         rcx
   976     jnz         .loop
   978     add rsp, 16 * 10                        ;release the scratch slots
   979     pop rsp                                 ;NOTE(review): presumably the rsp saved by ALIGN_STACK - confirm in x86_abi_support.asm
   981     ; begin epilog
   982     pop rdi
   983     pop rsi
   984     RESTORE_XMM
   985     UNSHADOW_ARGS
   986     pop         rbp
   987     ret

mercurial