media/libvpx/vp8/common/x86/subpixel_sse2.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12 %include "vpx_ports/x86_abi_support.asm"
    13 extern sym(vp8_bilinear_filters_x86_8)
    15 %define BLOCK_HEIGHT_WIDTH 4                                ; not referenced by the six-tap routines reviewed below
    16 %define VP8_FILTER_WEIGHT 128                               ; equals 1 << VP8_FILTER_SHIFT
    17 %define VP8_FILTER_SHIFT  7                                 ; same value as the literal 7 used by the psraw instructions below
       ; NOTE(review): the six-tap routines that follow hard-code the shift as `psraw ..., 7`
       ; instead of using VP8_FILTER_SHIFT; keep the two in sync if either is ever changed.
    20 ;/************************************************************************************
    21 ; Notes: vp8_filter_block1d8_h6_sse2 applies a 6-tap filter horizontally to the input
    22 ; pixels. The input pixel array has output_height rows. The original note says that
    23 ; output_height is assumed to be even; the loop below simply counts rows down with
    24 ; dec/jnz, so what it actually requires is output_height >= 1. This function handles 8
       ; pixels in the horizontal direction, calculating ONE row per iteration with 128-bit
       ; operations.
       ;
       ; Per row: src_ptr[-2 .. 13] is gathered into one register, six byte-shifted copies are
       ; widened to 16-bit words and multiplied by the six taps, the products are summed with
       ; signed saturation, the rounding vector `rd` is added, the sum is shifted right by 7,
       ; clamped to 0..255 and stored as eight 16-bit values (16 bytes) at output_ptr.
       ; `rd` is defined outside the code shown here (presumably 64 in every word, i.e. half
       ; of 1 << 7 - confirm at its definition).
    25 ;*************************************************************************************/
    26 ;void vp8_filter_block1d8_h6_sse2
    27 ;(
    28 ;    unsigned char  *src_ptr,
    29 ;    unsigned short *output_ptr,
    30 ;    unsigned int    src_pixels_per_line,
    31 ;    unsigned int    pixel_step,
    32 ;    unsigned int    output_height,
    33 ;    unsigned int    output_width,
    34 ;    short           *vp8_filter
    35 ;)
       ;
       ;  src_ptr             - first pixel of the first row; bytes [-2, 13] of each row are read
       ;  output_ptr          - written with movdqa, so it must be 16-byte aligned
       ;  src_pixels_per_line - added to the source pointer after every row (byte stride)
       ;  pixel_step          - not read by this routine
       ;  output_height       - number of rows to produce (loop counter, must be >= 1)
       ;  output_width        - added to output_ptr after every row, i.e. used as a byte stride;
       ;                        it has to keep output_ptr 16-byte aligned
       ;  vp8_filter          - six 16-byte vectors of words at offsets 0, 16, ..., 80, one per
       ;                        tap, used as pmullw memory operands (presumably each tap value
       ;                        replicated 8 times - confirm at the table definition)
    36 global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
    37 sym(vp8_filter_block1d8_h6_sse2):
       ; Prolog: the ABI helper macros used here are not defined in the code shown
       ; (presumably they come from the x86_abi_support.asm include).
    38     push        rbp
    39     mov         rbp, rsp
    40     SHADOW_ARGS_TO_STACK 7
    41     SAVE_XMM 7
    42     GET_GOT     rbx
    43     push        rsi
    44     push        rdi
    45     ; end prolog
    47         mov         rdx,        arg(6) ;vp8_filter
    48         mov         rsi,        arg(0) ;src_ptr
    50         mov         rdi,        arg(1) ;output_ptr
    52         movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
    53         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    54 %if ABI_IS_32BIT=0
    55         movsxd      r8,         dword ptr arg(5) ;output_width (cached in r8 on 64-bit builds only)
    56 %endif
    57         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    59 .filter_block1d8_h6_rowloop:
    60         movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5] in the low 8 bytes
    61         movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13] in the low 8 bytes
    63         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
    65         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
    66         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13]
    68         movdqa      xmm4,       xmm1
    69         movdqa      xmm5,       xmm1
    71         movdqa      xmm6,       xmm1
    72         movdqa      xmm7,       xmm1
       ; Each copy is shifted right by 1..5 bytes and its low 8 bytes are zero-extended to
       ; words, giving the eight inputs for taps 2..6; xmm3 (unshifted) feeds tap 1.
    74         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    75         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    77         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    78         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    80         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    81         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    84         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    85         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    87         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    89         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    90         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    92         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    94         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    95         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    98         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   100         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   101         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
       ; Accumulate the six products into xmm4 with signed-saturating word adds.
   104         paddsw      xmm4,       xmm7
   105         paddsw      xmm4,       xmm5
   107         paddsw      xmm4,       xmm3
   108         paddsw      xmm4,       xmm6
   110         paddsw      xmm4,       xmm1
   111         paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector
   113         psraw       xmm4,       7                           ; arithmetic shift right by 7
   115         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255 (unsigned saturation)
   116         punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to words
   118         movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned 16-byte store of 8 words
   119         lea         rsi,        [rsi + rax]                 ; next source row
   121 %if ABI_IS_32BIT
   122         add         rdi,        DWORD Ptr arg(5) ;[output_width]
   123 %else
   124         add         rdi,        r8
   125 %endif
   126         dec         rcx
   128         jnz         .filter_block1d8_h6_rowloop                ; next row
   130     ; begin epilog
   131     pop rdi
   132     pop rsi
   133     RESTORE_GOT
   134     RESTORE_XMM
   135     UNSHADOW_ARGS
   136     pop         rbp
   137     ret
   140 ;void vp8_filter_block1d16_h6_sse2
   141 ;(
   142 ;    unsigned char  *src_ptr,
   143 ;    unsigned short *output_ptr,
   144 ;    unsigned int    src_pixels_per_line,
   145 ;    unsigned int    pixel_step,
   146 ;    unsigned int    output_height,
   147 ;    unsigned int    output_width,
   148 ;    short           *vp8_filter
   149 ;)
   150 ;/************************************************************************************
   151 ; Notes: vp8_filter_block1d16_h6_sse2 applies a 6-tap filter horizontally to the input
   152 ; pixels. The input pixel array has output_height rows. The original note says that
   153 ; output_height is assumed to be even; the loop below counts rows down with dec/jnz, so
   154 ; what it actually requires is output_height >= 1. This function handles 16 pixels in the
       ; horizontal direction, calculating ONE row per iteration as two 8-pixel halves.
       ;
       ; Per row: src_ptr[-2 .. 21] is read with three 8-byte loads. xmm1 = src[-2 .. 13]
       ; feeds output pixels 0..7 and xmm2 = src[6 .. 21] feeds output pixels 8..15. Each half
       ; is filtered, rounded with `rd`, shifted right by 7, clamped to 0..255 and stored as
       ; eight 16-bit values: pixels 0..7 at [output_ptr], pixels 8..15 at [output_ptr + 16].
       ; `rd` is defined outside the code shown here (presumably 64 in every word - confirm).
       ;
       ;  src_ptr             - first pixel of the first row; bytes [-2, 21] of each row are read
       ;  output_ptr          - written with movdqa, so it must be 16-byte aligned
       ;  src_pixels_per_line - added to the source pointer after every row (byte stride)
       ;  pixel_step          - not read by this routine
       ;  output_height       - number of rows to produce (loop counter, must be >= 1)
       ;  output_width        - added to output_ptr after every row, i.e. used as a byte stride;
       ;                        it has to keep output_ptr 16-byte aligned
       ;  vp8_filter          - six 16-byte vectors of words at offsets 0, 16, ..., 80, one per
       ;                        tap, used as pmullw memory operands (presumably each tap value
       ;                        replicated 8 times - confirm at the table definition)
   155 ;*************************************************************************************/
   156 global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
   157 sym(vp8_filter_block1d16_h6_sse2):
       ; Prolog: the ABI helper macros used here are not defined in the code shown
       ; (presumably they come from the x86_abi_support.asm include).
   158     push        rbp
   159     mov         rbp, rsp
   160     SHADOW_ARGS_TO_STACK 7
   161     SAVE_XMM 7
   162     GET_GOT     rbx
   163     push        rsi
   164     push        rdi
   165     ; end prolog
   167         mov         rdx,        arg(6) ;vp8_filter
   168         mov         rsi,        arg(0) ;src_ptr
   170         mov         rdi,        arg(1) ;output_ptr
   172         movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
   173         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
   174 %if ABI_IS_32BIT=0
   175         movsxd      r8,         dword ptr arg(5) ;output_width (cached in r8 on 64-bit builds only)
   176 %endif
   178         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   180 .filter_block1d16_h6_sse2_rowloop:
   181         movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5]
   182         movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13]
   184         movq        xmm2,       MMWORD PTR [rsi +14]        ; src[14 .. 21]
   185         pslldq      xmm2,       8                           ; move src[14 .. 21] into the high 8 bytes
   187         por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21] (input for pixels 8..15)
   188         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
   190         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
   191         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13] (input for pixels 0..7)
       ; ---- first half: output pixels 0..7 ----
   193         movdqa      xmm4,       xmm1
   194         movdqa      xmm5,       xmm1
   196         movdqa      xmm6,       xmm1
   197         movdqa      xmm7,       xmm1
   199         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   200         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   202         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   203         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   205         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   206         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   209         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   210         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   212         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   214         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   215         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   217         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   219         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   220         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   223         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   225         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   226         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
       ; Accumulate the six products into xmm4 with signed-saturating word adds.
   228         paddsw      xmm4,       xmm7
   229         paddsw      xmm4,       xmm5
   231         paddsw      xmm4,       xmm3
   232         paddsw      xmm4,       xmm6
   234         paddsw      xmm4,       xmm1
   235         paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector
   237         psraw       xmm4,       7                           ; arithmetic shift right by 7
   239         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255 (unsigned saturation)
   240         punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to words
   242         movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned store of output pixels 0..7
       ; ---- second half: output pixels 8..15, computed from xmm2 = src[6 .. 21] ----
       ; The lane indices in the comments below are relative to pixel 8 (add 8 for the
       ; absolute source offset).
   244         movdqa      xmm3,       xmm2
   245         movdqa      xmm4,       xmm2
   247         movdqa      xmm5,       xmm2
   248         movdqa      xmm6,       xmm2
   250         movdqa      xmm7,       xmm2
   252         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   253         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   255         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   256         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   258         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   259         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   262         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   263         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   265         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   267         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   268         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   270         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   272         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   273         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   275         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   277         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   278         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   281         paddsw      xmm4,       xmm7
   282         paddsw      xmm4,       xmm5
   284         paddsw      xmm4,       xmm3
   285         paddsw      xmm4,       xmm6
   287         paddsw      xmm4,       xmm2
   288         paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector
   290         psraw       xmm4,       7                           ; arithmetic shift right by 7
   292         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255 (unsigned saturation)
   293         punpcklbw   xmm4,       xmm0                        ; widen the 8 clamped bytes back to words
   295         movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; aligned store of output pixels 8..15
   297         lea         rsi,        [rsi + rax]                 ; next source row
   298 %if ABI_IS_32BIT
   299         add         rdi,        DWORD Ptr arg(5) ;[output_width]
   300 %else
   301         add         rdi,        r8
   302 %endif
   304         dec         rcx
   305         jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
   307     ; begin epilog
   308     pop rdi
   309     pop rsi
   310     RESTORE_GOT
   311     RESTORE_XMM
   312     UNSHADOW_ARGS
   313     pop         rbp
   314     ret
   317 ;void vp8_filter_block1d8_v6_sse2
   318 ;(
   319 ;    short *src_ptr,
   320 ;    unsigned char *output_ptr,
   321 ;    int dst_pitch,
   322 ;    unsigned int pixels_per_line,
   323 ;    unsigned int pixel_step,
   324 ;    unsigned int output_height,
   325 ;    unsigned int output_width,
   326 ;    short * vp8_filter
   327 ;)
   328 ;/************************************************************************************
   329 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
   330 ; input pixel array has output_height rows.
       ;
       ; The input is 16-bit data, 8 words per row. For each output row the six input rows
       ; -2 .. +3 (relative to the current row) are each multiplied by one tap vector, summed
       ; with signed saturation, rounded with `rd`, shifted right by 7, clamped to 0..255 and
       ; stored as 8 bytes. `rd` is defined outside the code shown here (presumably 64 in
       ; every word - confirm).
       ;
       ;  src_ptr         - current row of 16-bit input; the routine starts reading two rows
       ;                    above it. Rows are loaded with movdqa, so every row address must be
       ;                    16-byte aligned
       ;  output_ptr      - receives 8 bytes per row (movq store, no alignment requirement)
       ;  dst_pitch       - added to output_ptr after every row (byte stride)
       ;  pixels_per_line - distance between input rows, used directly as a byte offset
       ;  pixel_step      - not read by this routine
       ;  output_height   - number of rows to produce (loop counter, must be >= 1)
       ;  output_width    - not read by this routine
       ;  vp8_filter      - six 16-byte vectors of words at offsets 0, 16, ..., 80, one per tap,
       ;                    used as pmullw memory operands (presumably each tap value replicated
       ;                    8 times - confirm at the table definition)
   331 ;*************************************************************************************/
   332 global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
   333 sym(vp8_filter_block1d8_v6_sse2):
       ; Prolog: the ABI helper macros used here are not defined in the code shown
       ; (presumably they come from the x86_abi_support.asm include).
   334     push        rbp
   335     mov         rbp, rsp
   336     SHADOW_ARGS_TO_STACK 8
   337     SAVE_XMM 7
   338     GET_GOT     rbx
   339     push        rsi
   340     push        rdi
   341     ; end prolog
   343         mov         rax,        arg(7) ;vp8_filter
   344         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
   346         mov         rdi,        arg(1) ;output_ptr
   347         mov         rsi,        arg(0) ;src_ptr
   349         sub         rsi,        rdx                         ; back up two rows so that
   350         sub         rsi,        rdx                         ; rsi points at row -2 (first tap)
   352         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
   353         pxor        xmm0,       xmm0                        ; clear xmm0
   355         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding vector, loaded once for all rows
   356 %if ABI_IS_32BIT=0
   357         movsxd      r8,         dword ptr arg(2) ; dst_pitch
   358 %endif
   360 .vp8_filter_block1d8_v6_sse2_loop:
   361         movdqa      xmm1,       XMMWORD PTR [rsi]               ; row -2
   362         pmullw      xmm1,       [rax]                           ; * tap 1
   364         movdqa      xmm2,       XMMWORD PTR [rsi + rdx]         ; row -1
   365         pmullw      xmm2,       [rax + 16]                      ; * tap 2
   367         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]     ; row  0
   368         pmullw      xmm3,       [rax + 32]                      ; * tap 3
   370         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]     ; row +2
   371         pmullw      xmm5,       [rax + 64]                      ; * tap 5
       ; Index scales are limited to 1/2/4/8, so rows +1 and +3 (offsets 3x and 5x the pitch)
       ; are addressed after advancing rsi by one row. The same add is also the per-row
       ; advance of the source pointer for the next iteration.
   373         add         rsi,        rdx
   374         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]     ; row +1
   376         pmullw      xmm4,       [rax + 48]                      ; * tap 4
   377         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]     ; row +3
   379         pmullw      xmm6,       [rax + 80]                      ; * tap 6
       ; Accumulate the six products into xmm2 with signed-saturating word adds.
   381         paddsw      xmm2,       xmm5
   382         paddsw      xmm2,       xmm3
   384         paddsw      xmm2,       xmm1
   385         paddsw      xmm2,       xmm4
   387         paddsw      xmm2,       xmm6
   388         paddsw      xmm2,       xmm7                        ; add the rounding vector
   390         psraw       xmm2,       7                           ; arithmetic shift right by 7
   391         packuswb    xmm2,       xmm0              ; pack and saturate
   393         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
   394 %if ABI_IS_32BIT
   395         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
   396 %else
   397         add         rdi,        r8
   398 %endif
   399         dec         rcx         ; decrement count
   400         jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
   402     ; begin epilog
   403     pop rdi
   404     pop rsi
   405     RESTORE_GOT
   406     RESTORE_XMM
   407     UNSHADOW_ARGS
   408     pop         rbp
   409     ret
   412 ;void vp8_filter_block1d16_v6_sse2
   413 ;(
   414 ;    unsigned short *src_ptr,
   415 ;    unsigned char *output_ptr,
   416 ;    int dst_pitch,
   417 ;    unsigned int pixels_per_line,
   418 ;    unsigned int pixel_step,
   419 ;    unsigned int output_height,
   420 ;    unsigned int output_width,
   421 ;    const short    *vp8_filter
   422 ;)
   423 ;/************************************************************************************
   424 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
   425 ; input pixel array has output_height rows.
       ;
       ; The input is 16-bit data, 16 words (two 16-byte vectors) per row. For each output row
       ; the six input rows -2 .. +3 (relative to the current row) are each multiplied by one
       ; tap vector, summed with signed saturation, rounded with `rd`, shifted right by 7 and
       ; packed with unsigned saturation into 16 output bytes. `rd` is defined outside the
       ; code shown here (presumably 64 in every word - confirm).
       ;
       ;  src_ptr         - current row of 16-bit input; the routine starts reading two rows
       ;                    above it. Rows are loaded with movdqa, so every row address must be
       ;                    16-byte aligned
       ;  output_ptr      - receives 16 bytes per row via movdqa, so it must be 16-byte aligned
       ;  dst_pitch       - added to output_ptr after every row (byte stride); it has to keep
       ;                    output_ptr 16-byte aligned
       ;  pixels_per_line - distance between input rows, used directly as a byte offset
       ;  pixel_step      - not read by this routine
       ;  output_height   - number of rows to produce (loop counter, must be >= 1)
       ;  output_width    - not read by this routine
       ;  vp8_filter      - six 16-byte vectors of words at offsets 0, 16, ..., 80, one per tap,
       ;                    used as pmullw memory operands (presumably each tap value replicated
       ;                    8 times - confirm at the table definition)
   426 ;*************************************************************************************/
   427 global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
   428 sym(vp8_filter_block1d16_v6_sse2):
       ; Prolog: the ABI helper macros used here are not defined in the code shown
       ; (presumably they come from the x86_abi_support.asm include).
   429     push        rbp
   430     mov         rbp, rsp
   431     SHADOW_ARGS_TO_STACK 8
   432     SAVE_XMM 7
   433     GET_GOT     rbx
   434     push        rsi
   435     push        rdi
   436     ; end prolog
   438         mov         rax,        arg(7) ;vp8_filter
   439         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
   441         mov         rdi,        arg(1) ;output_ptr
   442         mov         rsi,        arg(0) ;src_ptr
   444         sub         rsi,        rdx                         ; back up two rows so that
   445         sub         rsi,        rdx                         ; rsi points at row -2 ("line 1")
   447         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
   448 %if ABI_IS_32BIT=0
   449         movsxd      r8,         dword ptr arg(2) ; dst_pitch
   450 %endif
   452 .vp8_filter_block1d16_v6_sse2_loop:
   453 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
       ; Odd-numbered registers hold words 0..7 of a row, the following register words 8..15.
   454         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
   455         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
   456         pmullw      xmm1,       [rax + 16]
   457         pmullw      xmm2,       [rax + 16]
   459         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
   460         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
   461         pmullw      xmm3,       [rax + 64]
   462         pmullw      xmm4,       [rax + 64]
   464         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
   465         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
   466         pmullw      xmm5,       [rax + 32]
   467         pmullw      xmm6,       [rax + 32]
   469         movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
   470         movdqa      xmm0,       XMMWORD PTR [rsi + 16]
   471         pmullw      xmm7,       [rax]
   472         pmullw      xmm0,       [rax]
       ; xmm1 / xmm2 accumulate the low / high 8 columns with signed-saturating word adds.
   474         paddsw      xmm1,       xmm3
   475         paddsw      xmm2,       xmm4
   476         paddsw      xmm1,       xmm5
   477         paddsw      xmm2,       xmm6
   478         paddsw      xmm1,       xmm7
   479         paddsw      xmm2,       xmm0
       ; Index scales are limited to 1/2/4/8, so lines 4 and 6 (offsets 3x and 5x the pitch)
       ; are addressed after advancing rsi by one row. The same add is also the per-row
       ; advance of the source pointer for the next iteration.
   481         add         rsi,        rdx
   483         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
   484         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
   485         pmullw      xmm3,       [rax + 48]
   486         pmullw      xmm4,       [rax + 48]
   488         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
   489         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
   490         pmullw      xmm5,       [rax + 80]
   491         pmullw      xmm6,       [rax + 80]
   493         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding vector, reloaded every row (xmm7 was used for line 1)
       ; NOTE(review): xmm0 is not read again before it is overwritten by the line-1 load of
       ; the next iteration (the pack below takes xmm2, not xmm0), so this clear looks redundant.
   494         pxor        xmm0,       xmm0                        ; clear xmm0
   496         paddsw      xmm1,       xmm3
   497         paddsw      xmm2,       xmm4
   498         paddsw      xmm1,       xmm5
   499         paddsw      xmm2,       xmm6
   501         paddsw      xmm1,       xmm7                        ; add the rounding vector
   502         paddsw      xmm2,       xmm7
   504         psraw       xmm1,       7                           ; arithmetic shift right by 7
   505         psraw       xmm2,       7
   507         packuswb    xmm1,       xmm2              ; pack and saturate
   508         movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
   509 %if ABI_IS_32BIT
   510         add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
   511 %else
   512         add         rdi,        r8
   513 %endif
   514         dec         rcx         ; decrement count
   515         jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
   517     ; begin epilog
   518     pop rdi
   519     pop rsi
   520     RESTORE_GOT
   521     RESTORE_XMM
   522     UNSHADOW_ARGS
   523     pop         rbp
   524     ret
   527 ;void vp8_filter_block1d8_h6_only_sse2
   528 ;(
   529 ;    unsigned char  *src_ptr,
   530 ;    unsigned int    src_pixels_per_line,
   531 ;    unsigned char  *output_ptr,
   532 ;    int dst_pitch,
   533 ;    unsigned int    output_height,
   534 ;    const short    *vp8_filter
   535 ;)
   536 ; First-pass filter only when yoffset==0
       ;
       ; Horizontal 6-tap filter for 8 pixels per row that writes final 8-bit pixels directly.
       ; Per row: src_ptr[-2 .. 13] is gathered into one register, six byte-shifted copies are
       ; widened to words and multiplied by the six taps, the products are summed with signed
       ; saturation, the rounding vector `rd` is added, the sum is shifted right by 7, clamped
       ; to 0..255 and stored as 8 bytes. `rd` is defined outside the code shown here
       ; (presumably 64 in every word - confirm).
       ;
       ;  src_ptr             - first pixel of the first row; bytes [-2, 13] of each row are read
       ;  src_pixels_per_line - added to the source pointer after every row (byte stride)
       ;  output_ptr          - receives 8 bytes per row (movq store, no alignment requirement)
       ;  dst_pitch           - added to output_ptr after every row (byte stride)
       ;  output_height       - number of rows to produce (loop counter, must be >= 1)
       ;  vp8_filter          - six 16-byte vectors of words at offsets 0, 16, ..., 80, one per
       ;                        tap, used as pmullw memory operands (presumably each tap value
       ;                        replicated 8 times - confirm at the table definition)
   537 global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
   538 sym(vp8_filter_block1d8_h6_only_sse2):
       ; Prolog: the ABI helper macros used here are not defined in the code shown
       ; (presumably they come from the x86_abi_support.asm include).
   539     push        rbp
   540     mov         rbp, rsp
   541     SHADOW_ARGS_TO_STACK 6
   542     SAVE_XMM 7
   543     GET_GOT     rbx
   544     push        rsi
   545     push        rdi
   546     ; end prolog
   548         mov         rdx,        arg(5) ;vp8_filter
   549         mov         rsi,        arg(0) ;src_ptr
   551         mov         rdi,        arg(2) ;output_ptr
   553         movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
   554         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
   555 %if ABI_IS_32BIT=0
   556         movsxd      r8,         dword ptr arg(3) ;dst_pitch (cached in r8 on 64-bit builds only)
   557 %endif
   558         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   560 .filter_block1d8_h6_only_rowloop:
   561         movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5] in the low 8 bytes
   562         movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13] in the low 8 bytes
   564         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
   566         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
   567         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13]
   569         movdqa      xmm4,       xmm1
   570         movdqa      xmm5,       xmm1
   572         movdqa      xmm6,       xmm1
   573         movdqa      xmm7,       xmm1
       ; Each copy is shifted right by 1..5 bytes and its low 8 bytes are zero-extended to
       ; words, giving the eight inputs for taps 2..6; xmm3 (unshifted) feeds tap 1.
   575         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   576         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   578         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   579         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   581         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   582         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   585         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   586         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   588         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   590         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   591         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   593         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   595         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   596         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   599         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   601         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   602         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
       ; Accumulate the six products into xmm4 with signed-saturating word adds.
   605         paddsw      xmm4,       xmm7
   606         paddsw      xmm4,       xmm5
   608         paddsw      xmm4,       xmm3
   609         paddsw      xmm4,       xmm6
   611         paddsw      xmm4,       xmm1
   612         paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector
   614         psraw       xmm4,       7                           ; arithmetic shift right by 7
   616         packuswb    xmm4,       xmm0                        ; clamp each word to 0..255; 8 result bytes in the low half
   618         movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
   619         lea         rsi,        [rsi + rax]                 ; next source row
   621 %if ABI_IS_32BIT
   622         add         rdi,        DWORD Ptr arg(3) ;dst_pitch
   623 %else
   624         add         rdi,        r8
   625 %endif
   626         dec         rcx
   628         jnz         .filter_block1d8_h6_only_rowloop               ; next row
   630     ; begin epilog
   631     pop rdi
   632     pop rsi
   633     RESTORE_GOT
   634     RESTORE_XMM
   635     UNSHADOW_ARGS
   636     pop         rbp
   637     ret
   640 ;void vp8_filter_block1d16_h6_only_sse2
   641 ;(
   642 ;    unsigned char  *src_ptr,
   643 ;    unsigned int    src_pixels_per_line,
   644 ;    unsigned char  *output_ptr,
   645 ;    int dst_pitch,
   646 ;    unsigned int    output_height,
   647 ;    const short    *vp8_filter
   648 ;)
   649 ; First-pass filter only when yoffset==0
       ;
       ; Horizontal 6-tap filter for 16 pixels per row that writes final 8-bit pixels directly.
       ; Per row: src_ptr[-2 .. 21] is read with three 8-byte loads. xmm1 = src[-2 .. 13] feeds
       ; output pixels 0..7 and xmm2 = src[6 .. 21] feeds output pixels 8..15. Each half is
       ; filtered, rounded with `rd`, shifted right by 7, clamped to 0..255 and stored as 8
       ; bytes: pixels 0..7 at [output_ptr], pixels 8..15 at [output_ptr + 8]. `rd` is defined
       ; outside the code shown here (presumably 64 in every word - confirm).
       ;
       ;  src_ptr             - first pixel of the first row; bytes [-2, 21] of each row are read
       ;  src_pixels_per_line - added to the source pointer after every row (byte stride)
       ;  output_ptr          - receives 16 bytes per row as two movq stores (no alignment
       ;                        requirement)
       ;  dst_pitch           - added to output_ptr after every row (byte stride)
       ;  output_height       - number of rows to produce (loop counter, must be >= 1)
       ;  vp8_filter          - six 16-byte vectors of words at offsets 0, 16, ..., 80, one per
       ;                        tap, used as pmullw memory operands (presumably each tap value
       ;                        replicated 8 times - confirm at the table definition)
   650 global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
   651 sym(vp8_filter_block1d16_h6_only_sse2):
       ; Prolog: the ABI helper macros used here are not defined in the code shown
       ; (presumably they come from the x86_abi_support.asm include).
   652     push        rbp
   653     mov         rbp, rsp
   654     SHADOW_ARGS_TO_STACK 6
   655     SAVE_XMM 7
   656     GET_GOT     rbx
   657     push        rsi
   658     push        rdi
   659     ; end prolog
   661         mov         rdx,        arg(5) ;vp8_filter
   662         mov         rsi,        arg(0) ;src_ptr
   664         mov         rdi,        arg(2) ;output_ptr
   666         movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
   667         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
   668 %if ABI_IS_32BIT=0
   669         movsxd      r8,         dword ptr arg(3) ;dst_pitch (cached in r8 on 64-bit builds only)
   670 %endif
   672         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   674 .filter_block1d16_h6_only_sse2_rowloop:
   675         movq        xmm3,       MMWORD PTR [rsi - 2]        ; src[-2 .. 5]
   676         movq        xmm1,       MMWORD PTR [rsi + 6]        ; src[ 6 .. 13]
   678         movq        xmm2,       MMWORD PTR [rsi +14]        ; src[14 .. 21]
   679         pslldq      xmm2,       8                           ; move src[14 .. 21] into the high 8 bytes
   681         por         xmm2,       xmm1                        ; xmm2 = src[6 .. 21] (input for pixels 8..15)
   682         prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
   684         pslldq      xmm1,       8                           ; move src[6 .. 13] into the high 8 bytes
   685         por         xmm1,       xmm3                        ; xmm1 = src[-2 .. 13] (input for pixels 0..7)
       ; ---- first half: output pixels 0..7 ----
   687         movdqa      xmm4,       xmm1
   688         movdqa      xmm5,       xmm1
   690         movdqa      xmm6,       xmm1
   691         movdqa      xmm7,       xmm1
   693         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   694         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   696         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   697         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   699         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   700         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   702         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   703         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   705         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   707         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   708         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   710         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   712         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   713         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   715         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   717         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   718         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
       ; Accumulate the six products into xmm4 with signed-saturating word adds.
   720         paddsw      xmm4,       xmm7
   721         paddsw      xmm4,       xmm5
   723         paddsw      xmm4,       xmm3
   724         paddsw      xmm4,       xmm6
   726         paddsw      xmm4,       xmm1
   727         paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector
   729         psraw       xmm4,       7                           ; arithmetic shift right by 7
   731         packuswb    xmm4,       xmm0                        ; lower 8 bytes
   733         movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
       ; ---- second half: output pixels 8..15, computed from xmm2 = src[6 .. 21] ----
       ; The lane indices in the comments below are relative to pixel 8 (add 8 for the
       ; absolute source offset).
   735         movdqa      xmm3,       xmm2
   736         movdqa      xmm4,       xmm2
   738         movdqa      xmm5,       xmm2
   739         movdqa      xmm6,       xmm2
   741         movdqa      xmm7,       xmm2
   743         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   744         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   746         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   747         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   749         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   750         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   752         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   753         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   755         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   757         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   758         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   760         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   762         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   763         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   765         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   767         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   768         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   770         paddsw      xmm4,       xmm7
   771         paddsw      xmm4,       xmm5
   773         paddsw      xmm4,       xmm3
   774         paddsw      xmm4,       xmm6
   776         paddsw      xmm4,       xmm2
   777         paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding vector
   779         psraw       xmm4,       7                           ; arithmetic shift right by 7
   781         packuswb    xmm4,       xmm0                        ; higher 8 bytes
   783         movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
   785         lea         rsi,        [rsi + rax]                 ; next source row
   786 %if ABI_IS_32BIT
   787         add         rdi,        DWORD Ptr arg(3) ;dst_pitch
   788 %else
   789         add         rdi,        r8
   790 %endif
   792         dec         rcx
   793         jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
   795     ; begin epilog
   796     pop rdi
   797     pop rsi
   798     RESTORE_GOT
   799     RESTORE_XMM
   800     UNSHADOW_ARGS
   801     pop         rbp
   802     ret
   805 ;void vp8_filter_block1d8_v6_only_sse2
   806 ;(
   807 ;    unsigned char *src_ptr,              arg(0): top row of the six source rows read for the first output row
   808 ;    unsigned int    src_pixels_per_line, arg(1): byte stride between source rows
   809 ;    unsigned char *output_ptr,           arg(2): destination; 8 bytes are stored per row
   810 ;    int dst_pitch,                       arg(3): byte stride between destination rows
   811 ;    unsigned int output_height,          arg(4): rows to produce; must be non-zero (dec/jnz loop with no pre-check)
   812 ;    const short    *vp8_filter           arg(5): six 16-byte tap vectors at byte offsets 0,16,32,48,64,80
   813 ;)
   814 ; Second-pass (vertical) 6-tap filter, 8 pixels wide, for use when xoffset==0: out = sat_u8((sum of row[k]*tap[k], k=0..5, + rd) >> 7)
   815 global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
   816 sym(vp8_filter_block1d8_v6_only_sse2):
   817     push        rbp
   818     mov         rbp, rsp
   819     SHADOW_ARGS_TO_STACK 6                               ; macro from x86_abi_support.asm; the six arguments are read through arg(n) below
   820     SAVE_XMM 7                                           ; paired with RESTORE_XMM; presumably preserves xmm6-xmm7 where the ABI requires it
   821     GET_GOT     rbx                                      ; paired with RESTORE_GOT; presumably sets up GLOBAL() addressing for PIC builds
   822     push        rsi
   823     push        rdi
   824     ; end prolog
   826         mov         rsi,        arg(0) ;src_ptr -> rsi (source row cursor)
   827         mov         rdi,        arg(2) ;output_ptr -> rdi (destination row cursor)
   829         movsxd      rcx,        dword ptr arg(4) ;output_height -> rcx (row counter)
   830         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line -> rdx (source stride)
   832         mov         rax,        arg(5) ;vp8_filter -> rax (tap table; used as a pmullw memory operand, so it needs 16-byte alignment)
   834         pxor        xmm0,       xmm0                        ; xmm0 = 0, used to widen bytes to words and as the upper half for packuswb
   836         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant, loaded once outside the loop
   837 %if ABI_IS_32BIT=0
   838         movsxd      r8,         dword ptr arg(3) ; dst_pitch -> r8 (64-bit builds only; 32-bit builds re-read arg(3) each row)
   839 %endif
   841 .vp8_filter_block1d8_v6_only_sse2_loop:                 ; one output row (8 pixels) per iteration
   842         movq        xmm1,       MMWORD PTR [rsi]                ; row 0
   843         movq        xmm2,       MMWORD PTR [rsi + rdx]          ; row 1
   844         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]      ; row 2
   845         movq        xmm5,       MMWORD PTR [rsi + rdx * 4]      ; row 4
   846         add         rsi,        rdx                             ; advance the cursor one source row; this is also the per-iteration step
   847         movq        xmm4,       MMWORD PTR [rsi + rdx * 2]      ; row 3 (old rsi + 3*stride)
   848         movq        xmm6,       MMWORD PTR [rsi + rdx * 4]      ; row 5 (old rsi + 5*stride)
   850         punpcklbw   xmm1,       xmm0                            ; bytes -> words
   851         pmullw      xmm1,       [rax]                           ; row 0 * tap 0
   853         punpcklbw   xmm2,       xmm0
   854         pmullw      xmm2,       [rax + 16]                      ; row 1 * tap 1
   856         punpcklbw   xmm3,       xmm0
   857         pmullw      xmm3,       [rax + 32]                      ; row 2 * tap 2
   859         punpcklbw   xmm5,       xmm0
   860         pmullw      xmm5,       [rax + 64]                      ; row 4 * tap 4
   862         punpcklbw   xmm4,       xmm0
   863         pmullw      xmm4,       [rax + 48]                      ; row 3 * tap 3
   865         punpcklbw   xmm6,       xmm0
   866         pmullw      xmm6,       [rax + 80]                      ; row 5 * tap 5
   868         paddsw      xmm2,       xmm5                            ; accumulate the six products with signed saturating adds
   869         paddsw      xmm2,       xmm3
   871         paddsw      xmm2,       xmm1
   872         paddsw      xmm2,       xmm4
   874         paddsw      xmm2,       xmm6
   875         paddsw      xmm2,       xmm7                            ; + rounding constant
   877         psraw       xmm2,       7                               ; arithmetic >> 7 (same value as VP8_FILTER_SHIFT)
   878         packuswb    xmm2,       xmm0              ; pack words to bytes with unsigned saturation (0..255)
   880         movq        QWORD PTR [rdi], xmm2         ; store the 8 result bytes in the destination
   881 %if ABI_IS_32BIT
   882         add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
   883 %else
   884         add         rdi,        r8
   885 %endif
   886         dec         rcx         ; decrement row count
   887         jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
   889     ; begin epilog
   890     pop rdi
   891     pop rsi
   892     RESTORE_GOT
   893     RESTORE_XMM
   894     UNSHADOW_ARGS
   895     pop         rbp
   896     ret
   899 ;void vp8_unpack_block1d16_h6_sse2
   900 ;(
   901 ;    unsigned char  *src_ptr,             arg(0): source; 16 bytes are read per row
   902 ;    unsigned short *output_ptr,          arg(1): destination; 32 bytes are stored per row with movdqa, so it must be 16-byte aligned
   903 ;    unsigned int    src_pixels_per_line, arg(2): byte stride between source rows
   904 ;    unsigned int    output_height,       arg(3): rows to process; must be non-zero (dec/jnz loop with no pre-check)
   905 ;    unsigned int    output_width         arg(4): added to the destination pointer after each row (a byte count; must keep rdi 16-byte aligned)
   906 ;)  Zero-extends 16 bytes of every source row to 16-bit words and stores them; no filtering is applied.
   907 global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
   908 sym(vp8_unpack_block1d16_h6_sse2):
   909     push        rbp
   910     mov         rbp, rsp
   911     SHADOW_ARGS_TO_STACK 5                               ; macro from x86_abi_support.asm; the five arguments are read through arg(n) below
   912     GET_GOT     rbx                                      ; paired with RESTORE_GOT; no GLOBAL() reference appears in this routine
   913     push        rsi
   914     push        rdi
   915     ; end prolog
   917         mov         rsi,        arg(0) ;src_ptr -> rsi (source row cursor)
   918         mov         rdi,        arg(1) ;output_ptr -> rdi (destination row cursor)
   920         movsxd      rcx,        dword ptr arg(3) ;output_height -> rcx (row counter)
   921         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line -> rax (source stride)
   923         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   924 %if ABI_IS_32BIT=0
   925         movsxd      r8,         dword ptr arg(4) ;output_width -> r8 (destination step; 64-bit builds only)
   926 %endif
   928 .unpack_block1d16_h6_sse2_rowloop:                      ; one source row (16 pixels) per iteration
   929         movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7 of this row
   930         movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15 of this row (a second load, not a copy of xmm1)
   932         punpcklbw   xmm3,       xmm0                        ; bytes 8..15 -> eight zero-extended words
   933         punpcklbw   xmm1,       xmm0                        ; bytes 0..7  -> eight zero-extended words
   935         movdqa      XMMWORD Ptr [rdi],         xmm1         ; words 0..7  (aligned store)
   936         movdqa      XMMWORD Ptr [rdi + 16],    xmm3         ; words 8..15 (aligned store)
   938         lea         rsi,        [rsi + rax]                 ; next source row
   939 %if ABI_IS_32BIT
   940         add         rdi,        DWORD Ptr arg(4) ;[output_width]
   941 %else
   942         add         rdi,        r8
   943 %endif
   944         dec         rcx
   945         jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
   947     ; begin epilog
   948     pop rdi
   949     pop rsi
   950     RESTORE_GOT
   951     UNSHADOW_ARGS
   952     pop         rbp
   953     ret
   956 ;void vp8_bilinear_predict16x16_sse2
   957 ;(
   958 ;    unsigned char  *src_ptr,             arg(0): source block
   959 ;    int   src_pixels_per_line,           arg(1): byte stride between source rows
   960 ;    int  xoffset,                        arg(2): index into vp8_bilinear_filters_x86_8 for the horizontal taps; 0 skips the first pass
   961 ;    int  yoffset,                        arg(3): index into vp8_bilinear_filters_x86_8 for the vertical taps; 0 skips the second pass (when xoffset != 0)
   962 ;    unsigned char *dst_ptr,              arg(4): destination; rows are stored with movdqa, so every row must be 16-byte aligned
   963 ;    int dst_pitch                        arg(5): byte stride between destination rows
   964 ;)  Writes 16 rows of 16 pixels. Three paths: both passes, .b16x16_sp_only (xoffset==0), .b16x16_fp_only (yoffset==0).
   965 extern sym(vp8_bilinear_filters_x86_8)                  ; filter table; also declared near the top of this file
   966 global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
   967 sym(vp8_bilinear_predict16x16_sse2):
   968     push        rbp
   969     mov         rbp, rsp
   970     SHADOW_ARGS_TO_STACK 6                               ; macro from x86_abi_support.asm; the six arguments are read through arg(n) below
   971     SAVE_XMM 7                                           ; paired with RESTORE_XMM; presumably preserves xmm6-xmm7 where the ABI requires it
   972     GET_GOT     rbx                                      ; paired with RESTORE_GOT; presumably sets up GLOBAL() addressing for PIC builds
   973     push        rsi
   974     push        rdi
   975     ; end prolog
   977     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
   978     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
   980         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]    ; rcx = filter table base
   981         movsxd      rax,        dword ptr arg(2) ;xoffset
   983         cmp         rax,        0      ;skip first_pass filter if xoffset=0
   984         je          .b16x16_sp_only
   986         shl         rax,        5      ; table entries are 32 bytes: two 16-byte tap vectors
   987         add         rax,        rcx    ;HFilter
   989         mov         rdi,        arg(4) ;dst_ptr
   990         mov         rsi,        arg(0) ;src_ptr
   991         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   993         movdqa      xmm1,       [rax]               ; xmm1 = first horizontal tap vector
   994         movdqa      xmm2,       [rax+16]            ; xmm2 = second horizontal tap vector
   996         movsxd      rax,        dword ptr arg(3) ;yoffset
   998         cmp         rax,        0      ;skip second_pass filter if yoffset=0
   999         je          .b16x16_fp_only    ; rsi, rdi, rdx (dst_pitch), xmm1 and xmm2 are already set up for that path
  1001         shl         rax,        5
  1002         add         rax,        rcx    ;VFilter (taps are read from [rax] and [rax+16] inside the loop)
  1004         lea         rcx,        [rdi+rdx*8]
  1005         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end pointer
  1006         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx now holds the source stride, no longer dst_pitch)
  1008         pxor        xmm0,       xmm0                ; xmm0 = 0 for byte -> word unpacking
  1010 %if ABI_IS_32BIT=0
  1011         movsxd      r8,         dword ptr arg(5) ;dst_pitch -> r8 (64-bit builds only; 32-bit builds re-read arg(5) each row)
  1012 %endif
  1013         ; get the first horizontal line done
  1014         movdqu      xmm3,       [rsi]               ; 16 source bytes of the first row (unaligned load)
  1015         movdqa      xmm4,       xmm3                 ; make a copy of current line
  1017         punpcklbw   xmm3,       xmm0                 ; low 8 bytes -> words
  1018         punpckhbw   xmm4,       xmm0                 ; high 8 bytes -> words
  1020         pmullw      xmm3,       xmm1                 ; * first horizontal tap
  1021         pmullw      xmm4,       xmm1
  1023         movdqu      xmm5,       [rsi+1]              ; the same row one pixel to the right (reads one byte past the 16)
  1024         movdqa      xmm6,       xmm5
  1026         punpcklbw   xmm5,       xmm0
  1027         punpckhbw   xmm6,       xmm0
  1029         pmullw      xmm5,       xmm2                 ; * second horizontal tap
  1030         pmullw      xmm6,       xmm2
  1032         paddw       xmm3,       xmm5
  1033         paddw       xmm4,       xmm6
  1035         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1036         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1038         paddw       xmm4,       [GLOBAL(rd)]
  1039         psraw       xmm4,       VP8_FILTER_SHIFT
  1041         movdqa      xmm7,       xmm3
  1042         packuswb    xmm7,       xmm4                 ; xmm7 = first-pass result of this row as 16 bytes, kept for the vertical pass
  1044         add         rsi,        rdx                 ; next line
  1045 .next_row:                                           ; per iteration: first pass on one more source row, blend with the previous row, store 16 pixels
  1046         movdqu      xmm3,       [rsi]               ; 16 source bytes of the current row
  1047         movdqa      xmm4,       xmm3                 ; make a copy of current line
  1049         punpcklbw   xmm3,       xmm0                 ; low 8 bytes -> words
  1050         punpckhbw   xmm4,       xmm0                 ; high 8 bytes -> words
  1052         pmullw      xmm3,       xmm1
  1053         pmullw      xmm4,       xmm1
  1055         movdqu      xmm5,       [rsi+1]
  1056         movdqa      xmm6,       xmm5
  1058         punpcklbw   xmm5,       xmm0
  1059         punpckhbw   xmm6,       xmm0
  1061         pmullw      xmm5,       xmm2
  1062         pmullw      xmm6,       xmm2
  1064         paddw       xmm3,       xmm5
  1065         paddw       xmm4,       xmm6
  1067         movdqa      xmm5,       xmm7                 ; previous row's first-pass result (packed bytes)
  1068         movdqa      xmm6,       xmm7
  1070         punpcklbw   xmm5,       xmm0
  1071         punpckhbw   xmm6,       xmm0
  1073         pmullw      xmm5,       [rax]                ; previous row * first vertical tap
  1074         pmullw      xmm6,       [rax]
  1076         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1077         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1079         paddw       xmm4,       [GLOBAL(rd)]
  1080         psraw       xmm4,       VP8_FILTER_SHIFT
  1082         movdqa      xmm7,       xmm3
  1083         packuswb    xmm7,       xmm4                 ; save the current row's first-pass result for the next iteration
  1085         pmullw      xmm3,       [rax+16]             ; current row * second vertical tap
  1086         pmullw      xmm4,       [rax+16]
  1088         paddw       xmm3,       xmm5
  1089         paddw       xmm4,       xmm6
  1091         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1092         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1094         paddw       xmm4,       [GLOBAL(rd)]
  1095         psraw       xmm4,       VP8_FILTER_SHIFT
  1097         packuswb    xmm3,       xmm4
  1098         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned 16-byte store)
  1100         add         rsi,        rdx                 ; next line
  1101 %if ABI_IS_32BIT
  1102         add         rdi,        DWORD PTR arg(5) ;dst_pitch
  1103 %else
  1104         add         rdi,        r8
  1105 %endif
  1107         cmp         rdi,        rcx                 ; done once rdi reaches dst_ptr + 16*dst_pitch
  1108         jne         .next_row
  1110         jmp         .done
  1112 .b16x16_sp_only:                                     ; xoffset == 0: vertical pass only, applied directly to source rows
  1113         movsxd      rax,        dword ptr arg(3) ;yoffset
  1114         shl         rax,        5
  1115         add         rax,        rcx    ;VFilter
  1117         mov         rdi,        arg(4) ;dst_ptr
  1118         mov         rsi,        arg(0) ;src_ptr
  1119         movsxd      rdx,        dword ptr arg(5) ;dst_pitch
  1121         movdqa      xmm1,       [rax]               ; xmm1 = first vertical tap vector
  1122         movdqa      xmm2,       [rax+16]            ; xmm2 = second vertical tap vector
  1124         lea         rcx,        [rdi+rdx*8]
  1125         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end pointer
  1126         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line (rax is reused as the source stride)
  1128         pxor        xmm0,       xmm0
  1130         ; load the first source line; it is the "previous row" of the first iteration
  1131         movdqu      xmm7,       [rsi]               ; 16 source bytes of the first row
  1133         add         rsi,        rax                 ; next line
  1134 .next_row_spo:
  1135         movdqu      xmm3,       [rsi]               ; 16 source bytes of the current row
  1137         movdqa      xmm5,       xmm7                 ; previous row
  1138         movdqa      xmm6,       xmm7
  1140         movdqa      xmm4,       xmm3                 ; make a copy of current line
  1141         movdqa      xmm7,       xmm3                 ; current row becomes the previous row of the next iteration
  1143         punpcklbw   xmm5,       xmm0
  1144         punpckhbw   xmm6,       xmm0
  1145         punpcklbw   xmm3,       xmm0                 ; low 8 bytes -> words
  1146         punpckhbw   xmm4,       xmm0                 ; high 8 bytes -> words
  1148         pmullw      xmm5,       xmm1                 ; previous row * first vertical tap
  1149         pmullw      xmm6,       xmm1
  1150         pmullw      xmm3,       xmm2                 ; current row * second vertical tap
  1151         pmullw      xmm4,       xmm2
  1153         paddw       xmm3,       xmm5
  1154         paddw       xmm4,       xmm6
  1156         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1157         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1159         paddw       xmm4,       [GLOBAL(rd)]
  1160         psraw       xmm4,       VP8_FILTER_SHIFT
  1162         packuswb    xmm3,       xmm4
  1163         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned 16-byte store)
  1165         add         rsi,        rax                 ; next line
  1166         add         rdi,        rdx                 ;dst_pitch
  1167         cmp         rdi,        rcx
  1168         jne         .next_row_spo
  1170         jmp         .done
  1172 .b16x16_fp_only:                                     ; yoffset == 0: horizontal pass only; entered with rsi/rdi/rdx/xmm1/xmm2 already loaded
  1173         lea         rcx,        [rdi+rdx*8]
  1174         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end pointer
  1175         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
  1176         pxor        xmm0,       xmm0
  1178 .next_row_fpo:
  1179         movdqu      xmm3,       [rsi]               ; 16 source bytes of the current row
  1180         movdqa      xmm4,       xmm3                 ; make a copy of current line
  1182         punpcklbw   xmm3,       xmm0                 ; low 8 bytes -> words
  1183         punpckhbw   xmm4,       xmm0                 ; high 8 bytes -> words
  1185         pmullw      xmm3,       xmm1                 ; * first horizontal tap
  1186         pmullw      xmm4,       xmm1
  1188         movdqu      xmm5,       [rsi+1]              ; the same row one pixel to the right
  1189         movdqa      xmm6,       xmm5
  1191         punpcklbw   xmm5,       xmm0
  1192         punpckhbw   xmm6,       xmm0
  1194         pmullw      xmm5,       xmm2                 ; * second horizontal tap
  1195         pmullw      xmm6,       xmm2
  1197         paddw       xmm3,       xmm5
  1198         paddw       xmm4,       xmm6
  1200         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1201         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1203         paddw       xmm4,       [GLOBAL(rd)]
  1204         psraw       xmm4,       VP8_FILTER_SHIFT
  1206         packuswb    xmm3,       xmm4
  1207         movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned 16-byte store)
  1209         add         rsi,        rax                 ; next line
  1210         add         rdi,        rdx                 ; dst_pitch
  1211         cmp         rdi,        rcx
  1212         jne         .next_row_fpo                    ; falls through to .done after the last row
  1214 .done:
  1215     ; begin epilog
  1216     pop rdi
  1217     pop rsi
  1218     RESTORE_GOT
  1219     RESTORE_XMM
  1220     UNSHADOW_ARGS
  1221     pop         rbp
  1222     ret
  1225 ;void vp8_bilinear_predict8x8_sse2
  1226 ;(
  1227 ;    unsigned char  *src_ptr,             arg(0): source block; nine rows are read, 16 bytes each
  1228 ;    int   src_pixels_per_line,           arg(1): byte stride between source rows
  1229 ;    int  xoffset,                        arg(2): index into vp8_bilinear_filters_x86_8 for the horizontal taps
  1230 ;    int  yoffset,                        arg(3): index into vp8_bilinear_filters_x86_8 for the vertical taps
  1231 ;    unsigned char *dst_ptr,              arg(4): destination; 8 bytes are stored per row with movq
  1232 ;    int dst_pitch                        arg(5): byte stride between destination rows
  1233 ;)  Writes 8 rows of 8 pixels. Both passes always run; there is no shortcut for a zero offset here.
  1234 global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
  1235 sym(vp8_bilinear_predict8x8_sse2):
  1236     push        rbp
  1237     mov         rbp, rsp
  1238     SHADOW_ARGS_TO_STACK 6                               ; macro from x86_abi_support.asm; the six arguments are read through arg(n) below
  1239     SAVE_XMM 7                                           ; paired with RESTORE_XMM; presumably preserves xmm6-xmm7 where the ABI requires it
  1240     GET_GOT     rbx                                      ; paired with RESTORE_GOT; presumably sets up GLOBAL() addressing for PIC builds
  1241     push        rsi
  1242     push        rdi
  1243     ; end prolog
  1245     ALIGN_STACK 16, rax                                  ; presumably aligns rsp to 16 and saves the old rsp for the `pop rsp` below - confirm in x86_abi_support.asm
  1246     sub         rsp, 144                         ; reserve 144 bytes = 9 rows * 16 bytes
  1248     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
  1249     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
  1250         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]    ; rcx = filter table base
  1252         mov         rsi,        arg(0) ;src_ptr
  1253         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
  1255     ;Read 9-line unaligned data in and put them on stack. This gives a big
  1256     ;performance boost.
  1257         movdqu      xmm0,       [rsi]                ; row 0
  1258         lea         rax,        [rdx + rdx*2]        ; rax = 3 * source stride
  1259         movdqu      xmm1,       [rsi+rdx]            ; row 1
  1260         movdqu      xmm2,       [rsi+rdx*2]          ; row 2
  1261         add         rsi,        rax                  ; step three rows
  1262         movdqu      xmm3,       [rsi]                ; row 3
  1263         movdqu      xmm4,       [rsi+rdx]            ; row 4
  1264         movdqu      xmm5,       [rsi+rdx*2]          ; row 5
  1265         add         rsi,        rax                  ; step three rows
  1266         movdqu      xmm6,       [rsi]                ; row 6
  1267         movdqu      xmm7,       [rsi+rdx]            ; row 7
  1269         movdqa      XMMWORD PTR [rsp],            xmm0    ; spill row 0 first so xmm0 can be reused for row 8
  1271         movdqu      xmm0,       [rsi+rdx*2]          ; row 8
  1273         movdqa      XMMWORD PTR [rsp+16],         xmm1
  1274         movdqa      XMMWORD PTR [rsp+32],         xmm2
  1275         movdqa      XMMWORD PTR [rsp+48],         xmm3
  1276         movdqa      XMMWORD PTR [rsp+64],         xmm4
  1277         movdqa      XMMWORD PTR [rsp+80],         xmm5
  1278         movdqa      XMMWORD PTR [rsp+96],         xmm6
  1279         movdqa      XMMWORD PTR [rsp+112],        xmm7
  1280         movdqa      XMMWORD PTR [rsp+128],        xmm0
  1282         movsxd      rax,        dword ptr arg(2) ;xoffset
  1283         shl         rax,        5      ; table entries are 32 bytes: two 16-byte tap vectors
  1284         add         rax,        rcx    ;HFilter
  1286         mov         rdi,        arg(4) ;dst_ptr
  1287         movsxd      rdx,        dword ptr arg(5) ;dst_pitch (rdx is reused; the source stride is no longer needed)
  1289         movdqa      xmm1,       [rax]                ; xmm1 = first horizontal tap vector
  1290         movdqa      xmm2,       [rax+16]             ; xmm2 = second horizontal tap vector
  1292         movsxd      rax,        dword ptr arg(3) ;yoffset
  1293         shl         rax,        5
  1294         add         rax,        rcx    ;VFilter
  1296         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8*dst_pitch, the loop end pointer
  1298         movdqa      xmm5,       [rax]                ; xmm5 = first vertical tap vector
  1299         movdqa      xmm6,       [rax+16]             ; xmm6 = second vertical tap vector
  1301         pxor        xmm0,       xmm0                 ; xmm0 = 0 for byte -> word unpacking and for packuswb
  1303         ; get the first horizontal line done
  1304         movdqa      xmm3,       XMMWORD PTR [rsp]    ; row 0 from the stack copy
  1305         movdqa      xmm4,       xmm3                 ; make a copy of current line
  1306         psrldq      xmm4,       1                    ; shift the copy right by one byte: pixels 1..8
  1308         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
  1309         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
  1311         pmullw      xmm3,       xmm1
  1312         pmullw      xmm4,       xmm2
  1314         paddw       xmm3,       xmm4
  1316         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1317         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1319         movdqa      xmm7,       xmm3                 ; xmm7 = previous row's first-pass result, kept as words
  1320         add         rsp,        16                 ; next line (rsp doubles as the row cursor into the stack copy)
  1321 .next_row8x8:                                        ; per iteration: first pass on one more row, blend with the previous row, store 8 pixels
  1322         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  1323         movdqa      xmm4,       xmm3                 ; make a copy of current line
  1324         psrldq      xmm4,       1
  1326         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
  1327         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
  1329         pmullw      xmm3,       xmm1
  1330         pmullw      xmm4,       xmm2
  1332         paddw       xmm3,       xmm4
  1333         pmullw      xmm7,       xmm5                 ; previous row * first vertical tap
  1335         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1336         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1338         movdqa      xmm4,       xmm3                 ; keep the current row's first-pass result
  1340         pmullw      xmm3,       xmm6                 ; current row * second vertical tap
  1341         paddw       xmm3,       xmm7
  1343         movdqa      xmm7,       xmm4                 ; current row becomes the previous row of the next iteration
  1345         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1346         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1348         packuswb    xmm3,       xmm0                 ; words -> bytes with unsigned saturation
  1349         movq        [rdi],      xmm3                 ; store the results in the destination
  1351         add         rsp,        16                 ; next line
  1352         add         rdi,        rdx                 ; next destination row (dst_pitch)
  1354         cmp         rdi,        rcx
  1355         jne         .next_row8x8
  1357     ;add rsp, 144  (not needed: the one `add rsp, 16` before the loop plus eight inside it already total 144)
  1358     pop rsp                                              ; restore the pre-alignment stack pointer (presumably the value saved by ALIGN_STACK)
  1359     ; begin epilog
  1360     pop rdi
  1361     pop rsi
  1362     RESTORE_GOT
  1363     RESTORE_XMM
  1364     UNSHADOW_ARGS
  1365     pop         rbp
  1366     ret
  1369 SECTION_RODATA
  1370 align 16                                             ; rd is read with movdqa and as a paddw/paddsw memory operand, so keep it 16-byte aligned
  1371 rd:                                                  ; rounding term added before the shift by 7 in every filter above
  1372     times 8 dw 0x40                                  ; eight words of 0x40 = 64, i.e. half of VP8_FILTER_WEIGHT (128)

mercurial