The Tor Browser: media/libvpx/vp8/common/x86/subpixel

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701, complying with the disk-avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

1 ;

     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.

3 ;

     4 ;  Use of this source code is governed by a BSD-style license

     5 ;  that can be found in the LICENSE file in the root of the source

     6 ;  tree. An additional intellectual property rights grant can be found

     7 ;  in the file PATENTS.  All contributing project authors may

     8 ;  be found in the AUTHORS file in the root of the source tree.

9 ;

    12 %include "vpx_ports/x86_abi_support.asm"

    13 extern sym(vp8_bilinear_filters_x86_8)

    15 %define BLOCK_HEIGHT_WIDTH 4

    16 %define VP8_FILTER_WEIGHT 128

    17 %define VP8_FILTER_SHIFT  7

    20 ;/************************************************************************************

    21 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

    22 ; input pixel array has output_height rows. This routine assumes that output_height is an

    23 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE

    24 ; rows each iteration to take advantage of the 128 bits operations.

    25 ;*************************************************************************************/

    26 ;void vp8_filter_block1d8_h6_sse2

    27 ;(

    28 ;    unsigned char  *src_ptr,

    29 ;    unsigned short *output_ptr,

    30 ;    unsigned int    src_pixels_per_line,

    31 ;    unsigned int    pixel_step,

    32 ;    unsigned int    output_height,

    33 ;    unsigned int    output_width,

    34 ;    short           *vp8_filter

    35 ;)

    36 global sym(vp8_filter_block1d8_h6_sse2) PRIVATE

    37 sym(vp8_filter_block1d8_h6_sse2):

    38     push        rbp

    39     mov         rbp, rsp

    40     SHADOW_ARGS_TO_STACK 7

    41     SAVE_XMM 7

    42     GET_GOT     rbx

    43     push        rsi

    44     push        rdi

    45     ; end prolog

    47         mov         rdx,        arg(6) ;vp8_filter

    48         mov         rsi,        arg(0) ;src_ptr

    50         mov         rdi,        arg(1) ;output_ptr

    52         movsxd      rcx,        dword ptr arg(4) ;output_height

    53         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

    54 %if ABI_IS_32BIT=0

    55         movsxd      r8,         dword ptr arg(5) ;output_width

    56 %endif

    57         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

    59 .filter_block1d8_h6_rowloop:

    60         movq        xmm3,       MMWORD PTR [rsi - 2]

    61         movq        xmm1,       MMWORD PTR [rsi + 6]

    63         prefetcht2  [rsi+rax-2]

    65         pslldq      xmm1,       8

    66         por         xmm1,       xmm3

    68         movdqa      xmm4,       xmm1

    69         movdqa      xmm5,       xmm1

    71         movdqa      xmm6,       xmm1

    72         movdqa      xmm7,       xmm1

    74         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

    75         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

    77         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

    78         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    80         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

    81         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

    84         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

    85         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

    87         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

    89         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

    90         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

    92         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

    94         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

    95         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

    98         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

   100         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

   101         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

   104         paddsw      xmm4,       xmm7

   105         paddsw      xmm4,       xmm5

   107         paddsw      xmm4,       xmm3

   108         paddsw      xmm4,       xmm6

   110         paddsw      xmm4,       xmm1

   111         paddsw      xmm4,       [GLOBAL(rd)]

   113         psraw       xmm4,       7

   115         packuswb    xmm4,       xmm0

   116         punpcklbw   xmm4,       xmm0

   118         movdqa      XMMWORD Ptr [rdi],         xmm4

   119         lea         rsi,        [rsi + rax]

   121 %if ABI_IS_32BIT

   122         add         rdi,        DWORD Ptr arg(5) ;[output_width]

   123 %else

   124         add         rdi,        r8

   125 %endif

   126         dec         rcx

   128         jnz         .filter_block1d8_h6_rowloop                ; next row

   130     ; begin epilog

   131     pop rdi

   132     pop rsi

   133     RESTORE_GOT

   134     RESTORE_XMM

   135     UNSHADOW_ARGS

   136     pop         rbp

   137     ret

   140 ;void vp8_filter_block1d16_h6_sse2

   141 ;(

   142 ;    unsigned char  *src_ptr,

   143 ;    unsigned short *output_ptr,

   144 ;    unsigned int    src_pixels_per_line,

   145 ;    unsigned int    pixel_step,

   146 ;    unsigned int    output_height,

   147 ;    unsigned int    output_width,

   148 ;    short           *vp8_filter

   149 ;)

   150 ;/************************************************************************************

   151 ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The

   152 ; input pixel array has output_height rows. This routine assumes that output_height is an

   153 ; even number. This function handles 8 pixels in horizontal direction, calculating ONE

   154 ; rows each iteration to take advantage of the 128 bits operations.

   155 ;*************************************************************************************/

   156 global sym(vp8_filter_block1d16_h6_sse2) PRIVATE

   157 sym(vp8_filter_block1d16_h6_sse2):

   158     push        rbp

   159     mov         rbp, rsp

   160     SHADOW_ARGS_TO_STACK 7

   161     SAVE_XMM 7

   162     GET_GOT     rbx

   163     push        rsi

   164     push        rdi

   165     ; end prolog

   167         mov         rdx,        arg(6) ;vp8_filter

   168         mov         rsi,        arg(0) ;src_ptr

   170         mov         rdi,        arg(1) ;output_ptr

   172         movsxd      rcx,        dword ptr arg(4) ;output_height

   173         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

   174 %if ABI_IS_32BIT=0

   175         movsxd      r8,         dword ptr arg(5) ;output_width

   176 %endif

   178         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

   180 .filter_block1d16_h6_sse2_rowloop:

   181         movq        xmm3,       MMWORD PTR [rsi - 2]

   182         movq        xmm1,       MMWORD PTR [rsi + 6]

   184         movq        xmm2,       MMWORD PTR [rsi +14]

   185         pslldq      xmm2,       8

   187         por         xmm2,       xmm1

   188         prefetcht2  [rsi+rax-2]

   190         pslldq      xmm1,       8

   191         por         xmm1,       xmm3

   193         movdqa      xmm4,       xmm1

   194         movdqa      xmm5,       xmm1

   196         movdqa      xmm6,       xmm1

   197         movdqa      xmm7,       xmm1

   199         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

   200         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

   202         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

   203         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

   205         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

   206         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

   209         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

   210         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

   212         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

   214         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

   215         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

   217         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

   219         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

   220         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

   223         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

   225         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

   226         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

   228         paddsw      xmm4,       xmm7

   229         paddsw      xmm4,       xmm5

   231         paddsw      xmm4,       xmm3

   232         paddsw      xmm4,       xmm6

   234         paddsw      xmm4,       xmm1

   235         paddsw      xmm4,       [GLOBAL(rd)]

   237         psraw       xmm4,       7

   239         packuswb    xmm4,       xmm0

   240         punpcklbw   xmm4,       xmm0

   242         movdqa      XMMWORD Ptr [rdi],         xmm4

   244         movdqa      xmm3,       xmm2

   245         movdqa      xmm4,       xmm2

   247         movdqa      xmm5,       xmm2

   248         movdqa      xmm6,       xmm2

   250         movdqa      xmm7,       xmm2

   252         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

   253         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

   255         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

   256         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

   258         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

   259         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

   262         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

   263         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

   265         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

   267         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

   268         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

   270         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

   272         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

   273         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

   275         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

   277         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

   278         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

   281         paddsw      xmm4,       xmm7

   282         paddsw      xmm4,       xmm5

   284         paddsw      xmm4,       xmm3

   285         paddsw      xmm4,       xmm6

   287         paddsw      xmm4,       xmm2

   288         paddsw      xmm4,       [GLOBAL(rd)]

   290         psraw       xmm4,       7

   292         packuswb    xmm4,       xmm0

   293         punpcklbw   xmm4,       xmm0

   295         movdqa      XMMWORD Ptr [rdi+16],      xmm4

   297         lea         rsi,        [rsi + rax]

   298 %if ABI_IS_32BIT

   299         add         rdi,        DWORD Ptr arg(5) ;[output_width]

   300 %else

   301         add         rdi,        r8

   302 %endif

   304         dec         rcx

   305         jnz         .filter_block1d16_h6_sse2_rowloop                ; next row

   307     ; begin epilog

   308     pop rdi

   309     pop rsi

   310     RESTORE_GOT

   311     RESTORE_XMM

   312     UNSHADOW_ARGS

   313     pop         rbp

   314     ret

   317 ;void vp8_filter_block1d8_v6_sse2

   318 ;(

   319 ;    short *src_ptr,

   320 ;    unsigned char *output_ptr,

   321 ;    int dst_ptich,

   322 ;    unsigned int pixels_per_line,

   323 ;    unsigned int pixel_step,

   324 ;    unsigned int output_height,

   325 ;    unsigned int output_width,

   326 ;    short * vp8_filter

   327 ;)

   328 ;/************************************************************************************

   329 ; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The

   330 ; input pixel array has output_height rows.

   331 ;*************************************************************************************/

   332 global sym(vp8_filter_block1d8_v6_sse2) PRIVATE

   333 sym(vp8_filter_block1d8_v6_sse2):

   334     push        rbp

   335     mov         rbp, rsp

   336     SHADOW_ARGS_TO_STACK 8

   337     SAVE_XMM 7

   338     GET_GOT     rbx

   339     push        rsi

   340     push        rdi

   341     ; end prolog

   343         mov         rax,        arg(7) ;vp8_filter

   344         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

   346         mov         rdi,        arg(1) ;output_ptr

   347         mov         rsi,        arg(0) ;src_ptr

   349         sub         rsi,        rdx

   350         sub         rsi,        rdx

   352         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]

   353         pxor        xmm0,       xmm0                        ; clear xmm0

   355         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

   356 %if ABI_IS_32BIT=0

   357         movsxd      r8,         dword ptr arg(2) ; dst_ptich

   358 %endif

   360 .vp8_filter_block1d8_v6_sse2_loop:

   361         movdqa      xmm1,       XMMWORD PTR [rsi]

   362         pmullw      xmm1,       [rax]

   364         movdqa      xmm2,       XMMWORD PTR [rsi + rdx]

   365         pmullw      xmm2,       [rax + 16]

   367         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]

   368         pmullw      xmm3,       [rax + 32]

   370         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]

   371         pmullw      xmm5,       [rax + 64]

   373         add         rsi,        rdx

   374         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2]

   376         pmullw      xmm4,       [rax + 48]

   377         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4]

   379         pmullw      xmm6,       [rax + 80]

   381         paddsw      xmm2,       xmm5

   382         paddsw      xmm2,       xmm3

   384         paddsw      xmm2,       xmm1

   385         paddsw      xmm2,       xmm4

   387         paddsw      xmm2,       xmm6

   388         paddsw      xmm2,       xmm7

   390         psraw       xmm2,       7

   391         packuswb    xmm2,       xmm0              ; pack and saturate

   393         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination

   394 %if ABI_IS_32BIT

   395         add         rdi,        DWORD PTR arg(2) ;[dst_ptich]

   396 %else

   397         add         rdi,        r8

   398 %endif

   399         dec         rcx         ; decrement count

   400         jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row

   402     ; begin epilog

   403     pop rdi

   404     pop rsi

   405     RESTORE_GOT

   406     RESTORE_XMM

   407     UNSHADOW_ARGS

   408     pop         rbp

   409     ret

   412 ;void vp8_filter_block1d16_v6_sse2

   413 ;(

   414 ;    unsigned short *src_ptr,

   415 ;    unsigned char *output_ptr,

   416 ;    int dst_ptich,

   417 ;    unsigned int pixels_per_line,

   418 ;    unsigned int pixel_step,

   419 ;    unsigned int output_height,

   420 ;    unsigned int output_width,

   421 ;    const short    *vp8_filter

   422 ;)

   423 ;/************************************************************************************

   424 ; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The

   425 ; input pixel array has output_height rows.

   426 ;*************************************************************************************/

   427 global sym(vp8_filter_block1d16_v6_sse2) PRIVATE

   428 sym(vp8_filter_block1d16_v6_sse2):

   429     push        rbp

   430     mov         rbp, rsp

   431     SHADOW_ARGS_TO_STACK 8

   432     SAVE_XMM 7

   433     GET_GOT     rbx

   434     push        rsi

   435     push        rdi

   436     ; end prolog

   438         mov         rax,        arg(7) ;vp8_filter

   439         movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

   441         mov         rdi,        arg(1) ;output_ptr

   442         mov         rsi,        arg(0) ;src_ptr

   444         sub         rsi,        rdx

   445         sub         rsi,        rdx

   447         movsxd      rcx,        DWORD PTR arg(5) ;[output_height]

   448 %if ABI_IS_32BIT=0

   449         movsxd      r8,         dword ptr arg(2) ; dst_ptich

   450 %endif

   452 .vp8_filter_block1d16_v6_sse2_loop:

   453 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.

   454         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2

   455         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]

   456         pmullw      xmm1,       [rax + 16]

   457         pmullw      xmm2,       [rax + 16]

   459         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5

   460         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]

   461         pmullw      xmm3,       [rax + 64]

   462         pmullw      xmm4,       [rax + 64]

   464         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3

   465         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]

   466         pmullw      xmm5,       [rax + 32]

   467         pmullw      xmm6,       [rax + 32]

   469         movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1

   470         movdqa      xmm0,       XMMWORD PTR [rsi + 16]

   471         pmullw      xmm7,       [rax]

   472         pmullw      xmm0,       [rax]

   474         paddsw      xmm1,       xmm3

   475         paddsw      xmm2,       xmm4

   476         paddsw      xmm1,       xmm5

   477         paddsw      xmm2,       xmm6

   478         paddsw      xmm1,       xmm7

   479         paddsw      xmm2,       xmm0

   481         add         rsi,        rdx

   483         movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4

   484         movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]

   485         pmullw      xmm3,       [rax + 48]

   486         pmullw      xmm4,       [rax + 48]

   488         movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6

   489         movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]

   490         pmullw      xmm5,       [rax + 80]

   491         pmullw      xmm6,       [rax + 80]

   493         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]

   494         pxor        xmm0,       xmm0                        ; clear xmm0

   496         paddsw      xmm1,       xmm3

   497         paddsw      xmm2,       xmm4

   498         paddsw      xmm1,       xmm5

   499         paddsw      xmm2,       xmm6

   501         paddsw      xmm1,       xmm7

   502         paddsw      xmm2,       xmm7

   504         psraw       xmm1,       7

   505         psraw       xmm2,       7

   507         packuswb    xmm1,       xmm2              ; pack and saturate

   508         movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination

   509 %if ABI_IS_32BIT

   510         add         rdi,        DWORD PTR arg(2) ;[dst_ptich]

   511 %else

   512         add         rdi,        r8

   513 %endif

   514         dec         rcx         ; decrement count

   515         jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row

   517     ; begin epilog

   518     pop rdi

   519     pop rsi

   520     RESTORE_GOT

   521     RESTORE_XMM

   522     UNSHADOW_ARGS

   523     pop         rbp

   524     ret

   527 ;void vp8_filter_block1d8_h6_only_sse2

   528 ;(

   529 ;    unsigned char  *src_ptr,

   530 ;    unsigned int    src_pixels_per_line,

   531 ;    unsigned char  *output_ptr,

   532 ;    int dst_ptich,

   533 ;    unsigned int    output_height,

   534 ;    const short    *vp8_filter

   535 ;)

   536 ; First-pass filter only when yoffset==0

   537 global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE

   538 sym(vp8_filter_block1d8_h6_only_sse2):

   539     push        rbp

   540     mov         rbp, rsp

   541     SHADOW_ARGS_TO_STACK 6

   542     SAVE_XMM 7

   543     GET_GOT     rbx

   544     push        rsi

   545     push        rdi

   546     ; end prolog

   548         mov         rdx,        arg(5) ;vp8_filter

   549         mov         rsi,        arg(0) ;src_ptr

   551         mov         rdi,        arg(2) ;output_ptr

   553         movsxd      rcx,        dword ptr arg(4) ;output_height

   554         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source

   555 %if ABI_IS_32BIT=0

   556         movsxd      r8,         dword ptr arg(3) ;dst_ptich

   557 %endif

   558         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

   560 .filter_block1d8_h6_only_rowloop:

   561         movq        xmm3,       MMWORD PTR [rsi - 2]

   562         movq        xmm1,       MMWORD PTR [rsi + 6]

   564         prefetcht2  [rsi+rax-2]

   566         pslldq      xmm1,       8

   567         por         xmm1,       xmm3

   569         movdqa      xmm4,       xmm1

   570         movdqa      xmm5,       xmm1

   572         movdqa      xmm6,       xmm1

   573         movdqa      xmm7,       xmm1

   575         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

   576         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

   578         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

   579         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

   581         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

   582         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

   585         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

   586         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

   588         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

   590         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

   591         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

   593         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

   595         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

   596         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

   599         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

   601         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

   602         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

   605         paddsw      xmm4,       xmm7

   606         paddsw      xmm4,       xmm5

   608         paddsw      xmm4,       xmm3

   609         paddsw      xmm4,       xmm6

   611         paddsw      xmm4,       xmm1

   612         paddsw      xmm4,       [GLOBAL(rd)]

   614         psraw       xmm4,       7

   616         packuswb    xmm4,       xmm0

   618         movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination

   619         lea         rsi,        [rsi + rax]

   621 %if ABI_IS_32BIT

   622         add         rdi,        DWORD Ptr arg(3) ;dst_ptich

   623 %else

   624         add         rdi,        r8

   625 %endif

   626         dec         rcx

   628         jnz         .filter_block1d8_h6_only_rowloop               ; next row

   630     ; begin epilog

   631     pop rdi

   632     pop rsi

   633     RESTORE_GOT

   634     RESTORE_XMM

   635     UNSHADOW_ARGS

   636     pop         rbp

   637     ret

   640 ;void vp8_filter_block1d16_h6_only_sse2

   641 ;(

   642 ;    unsigned char  *src_ptr,

   643 ;    unsigned int    src_pixels_per_line,

   644 ;    unsigned char  *output_ptr,

   645 ;    int dst_ptich,

   646 ;    unsigned int    output_height,

   647 ;    const short    *vp8_filter

   648 ;)

   649 ; First-pass filter only when yoffset==0

   650 global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE

   651 sym(vp8_filter_block1d16_h6_only_sse2):

   652     push        rbp

   653     mov         rbp, rsp

   654     SHADOW_ARGS_TO_STACK 6

   655     SAVE_XMM 7

   656     GET_GOT     rbx

   657     push        rsi

   658     push        rdi

   659     ; end prolog

   661         mov         rdx,        arg(5) ;vp8_filter

   662         mov         rsi,        arg(0) ;src_ptr

   664         mov         rdi,        arg(2) ;output_ptr

   666         movsxd      rcx,        dword ptr arg(4) ;output_height

   667         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source

   668 %if ABI_IS_32BIT=0

   669         movsxd      r8,         dword ptr arg(3) ;dst_ptich

   670 %endif

   672         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

   674 .filter_block1d16_h6_only_sse2_rowloop:

   675         movq        xmm3,       MMWORD PTR [rsi - 2]

   676         movq        xmm1,       MMWORD PTR [rsi + 6]

   678         movq        xmm2,       MMWORD PTR [rsi +14]

   679         pslldq      xmm2,       8

   681         por         xmm2,       xmm1

   682         prefetcht2  [rsi+rax-2]

   684         pslldq      xmm1,       8

   685         por         xmm1,       xmm3

   687         movdqa      xmm4,       xmm1

   688         movdqa      xmm5,       xmm1

   690         movdqa      xmm6,       xmm1

   691         movdqa      xmm7,       xmm1

   693         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

   694         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

   696         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

   697         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

   699         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

   700         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

   702         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

   703         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

   705         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

   707         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

   708         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

   710         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

   712         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

   713         psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

   715         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

   717         punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

   718         pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

   720         paddsw      xmm4,       xmm7

   721         paddsw      xmm4,       xmm5

   723         paddsw      xmm4,       xmm3

   724         paddsw      xmm4,       xmm6

   726         paddsw      xmm4,       xmm1

   727         paddsw      xmm4,       [GLOBAL(rd)]

   729         psraw       xmm4,       7

   731         packuswb    xmm4,       xmm0                        ; lower 8 bytes

   733         movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination

   735         movdqa      xmm3,       xmm2

   736         movdqa      xmm4,       xmm2

   738         movdqa      xmm5,       xmm2

   739         movdqa      xmm6,       xmm2

   741         movdqa      xmm7,       xmm2

   743         punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2

   744         psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1

   746         pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1

   747         punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

   749         psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00

   750         pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2

   752         punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00

   753         psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01

   755         pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3

   757         punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01

   758         psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02

   760         pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4

   762         punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02

   763         psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03

   765         pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5

   767         punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03

   768         pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6

   770         paddsw      xmm4,       xmm7

   771         paddsw      xmm4,       xmm5

   773         paddsw      xmm4,       xmm3

   774         paddsw      xmm4,       xmm6

   776         paddsw      xmm4,       xmm2

   777         paddsw      xmm4,       [GLOBAL(rd)]

   779         psraw       xmm4,       7

   781         packuswb    xmm4,       xmm0                        ; higher 8 bytes

   783         movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination

   785         lea         rsi,        [rsi + rax]

   786 %if ABI_IS_32BIT

   787         add         rdi,        DWORD Ptr arg(3) ;dst_ptich

   788 %else

   789         add         rdi,        r8

   790 %endif

   792         dec         rcx

   793         jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row

   795     ; begin epilog

   796     pop rdi

   797     pop rsi

   798     RESTORE_GOT

   799     RESTORE_XMM

   800     UNSHADOW_ARGS

   801     pop         rbp

   802     ret

   805 ;void vp8_filter_block1d8_v6_only_sse2

   806 ;(

   807 ;    unsigned char *src_ptr,

   808 ;    unsigned int    src_pixels_per_line,

   809 ;    unsigned char *output_ptr,

   810 ;    int dst_pitch,

   811 ;    unsigned int output_height,

   812 ;    const short    *vp8_filter

   813 ;)

   814 ; Second-pass filter only when xoffset==0
       ;
       ; Applies a 6-tap vertical filter to an 8-pixel-wide column of bytes.
       ; Each output row combines six consecutive source rows (src + 0..5 * pitch).
       ; Every row is zero-extended to words and multiplied by its own 16-byte tap
       ; vector at vp8_filter + 0/16/32/48/64/80. The six products are summed with
       ; saturating adds, the rounding constant rd is added, the sum is shifted
       ; right by 7, packed to unsigned bytes and 8 bytes are stored to output_ptr.
       ;
       ; Register roles inside the loop:
       ;   rsi = current source row      rdi = current destination row
       ;   rdx = source pitch            rcx = rows remaining
       ;   rax = filter tap vectors      xmm0 = zero, xmm7 = rd
       ;   r8  = dst_pitch (64-bit ABIs only; 32-bit builds read arg(3) directly)
       ;
       ; The row counter is decremented before it is tested, so output_height is
       ; expected to be non-zero.

   815 global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE

   816 sym(vp8_filter_block1d8_v6_only_sse2):

   817     push        rbp

   818     mov         rbp, rsp

   819     SHADOW_ARGS_TO_STACK 6

   820     SAVE_XMM 7

   821     GET_GOT     rbx

   822     push        rsi

   823     push        rdi

   824     ; end prolog

   826         mov         rsi,        arg(0) ;src_ptr

   827         mov         rdi,        arg(2) ;output_ptr

   829         movsxd      rcx,        dword ptr arg(4) ;output_height

   830         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

   832         mov         rax,        arg(5) ;vp8_filter

   834         pxor        xmm0,       xmm0                        ; clear xmm0

   836         movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant, kept for the whole loop

   837 %if ABI_IS_32BIT=0

   838         movsxd      r8,         dword ptr arg(3) ; dst_pitch

   839 %endif

   841 .vp8_filter_block1d8_v6_only_sse2_loop:

   842         movq        xmm1,       MMWORD PTR [rsi]                 ; row 0

   843         movq        xmm2,       MMWORD PTR [rsi + rdx]           ; row 1

   844         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]       ; row 2

   845         movq        xmm5,       MMWORD PTR [rsi + rdx * 4]       ; row 4

   846         add         rsi,        rdx                              ; advance one row; this is the only rsi update per iteration

   847         movq        xmm4,       MMWORD PTR [rsi + rdx * 2]       ; row 3 (counted from the row this iteration started on)

   848         movq        xmm6,       MMWORD PTR [rsi + rdx * 4]       ; row 5 (counted from the row this iteration started on)

   850         punpcklbw   xmm1,       xmm0              ; bytes -> words

   851         pmullw      xmm1,       [rax]             ; row 0 * tap 1

   853         punpcklbw   xmm2,       xmm0

   854         pmullw      xmm2,       [rax + 16]        ; row 1 * tap 2

   856         punpcklbw   xmm3,       xmm0

   857         pmullw      xmm3,       [rax + 32]        ; row 2 * tap 3

   859         punpcklbw   xmm5,       xmm0

   860         pmullw      xmm5,       [rax + 64]        ; row 4 * tap 5

   862         punpcklbw   xmm4,       xmm0

   863         pmullw      xmm4,       [rax + 48]        ; row 3 * tap 4

   865         punpcklbw   xmm6,       xmm0

   866         pmullw      xmm6,       [rax + 80]        ; row 5 * tap 6

   868         paddsw      xmm2,       xmm5              ; saturating accumulation of the six products into xmm2;

   869         paddsw      xmm2,       xmm3              ; the adds saturate, so their order is part of the result

   871         paddsw      xmm2,       xmm1

   872         paddsw      xmm2,       xmm4

   874         paddsw      xmm2,       xmm6

   875         paddsw      xmm2,       xmm7              ; + rounding constant (rd)

   877         psraw       xmm2,       7                 ; >> 7

   878         packuswb    xmm2,       xmm0              ; pack and saturate

   880         movq        QWORD PTR [rdi], xmm2         ; store the results in the destination

   881 %if ABI_IS_32BIT

   882         add         rdi,        DWORD PTR arg(3) ;[dst_pitch]

   883 %else

   884         add         rdi,        r8

   885 %endif

   886         dec         rcx         ; decrement count

   887         jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row

   889     ; begin epilog

   890     pop rdi

   891     pop rsi

   892     RESTORE_GOT

   893     RESTORE_XMM

   894     UNSHADOW_ARGS

   895     pop         rbp

   896     ret

   899 ;void vp8_unpack_block1d16_h6_sse2

   900 ;(

   901 ;    unsigned char  *src_ptr,

   902 ;    unsigned short *output_ptr,

   903 ;    unsigned int    src_pixels_per_line,

   904 ;    unsigned int    output_height,

   905 ;    unsigned int    output_width

   906 ;)
       ;
       ; Widens a 16-pixel-wide block from bytes to words without filtering.
       ; For each of output_height rows, 16 source bytes are zero-extended to
       ; 16-bit words and written as 32 bytes to output_ptr. The stores use
       ; movdqa, so every destination row must be 16-byte aligned.
       ; output_width is added to the destination pointer as a byte count after
       ; each row; src_pixels_per_line is the source stride.
       ; The row counter is decremented before it is tested, so output_height is
       ; expected to be non-zero.

   907 global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE

   908 sym(vp8_unpack_block1d16_h6_sse2):

   909     push        rbp

   910     mov         rbp, rsp

   911     SHADOW_ARGS_TO_STACK 5

   912     GET_GOT     rbx

   913     push        rsi

   914     push        rdi

   915     ; end prolog

   917         mov         rsi,        arg(0) ;src_ptr

   918         mov         rdi,        arg(1) ;output_ptr

   920         movsxd      rcx,        dword ptr arg(3) ;output_height

   921         movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source

   923         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack

   924 %if ABI_IS_32BIT=0

   925         movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for destination, in bytes

   926 %endif

   928 .unpack_block1d16_h6_sse2_rowloop:

   929         movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7 of this row

   930         movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15 of this row

   932         punpcklbw   xmm3,       xmm0                        ; bytes 8..15 zero-extended to words

   933         punpcklbw   xmm1,       xmm0                        ; bytes 0..7 zero-extended to words

   935         movdqa      XMMWORD Ptr [rdi],         xmm1         ; words 0..7  (aligned store)

   936         movdqa      XMMWORD Ptr [rdi + 16],    xmm3         ; words 8..15 (aligned store)

   938         lea         rsi,        [rsi + rax]                 ; next source row

   939 %if ABI_IS_32BIT

   940         add         rdi,        DWORD Ptr arg(4) ;[output_width]

   941 %else

   942         add         rdi,        r8

   943 %endif

   944         dec         rcx

   945         jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row

   947     ; begin epilog

   948     pop rdi

   949     pop rsi

   950     RESTORE_GOT

   951     UNSHADOW_ARGS

   952     pop         rbp

   953     ret

   956 ;void vp8_bilinear_predict16x16_sse2

   957 ;(

   958 ;    unsigned char  *src_ptr,

   959 ;    int   src_pixels_per_line,

   960 ;    int  xoffset,

   961 ;    int  yoffset,

   962 ;    unsigned char *dst_ptr,

   963 ;    int dst_pitch

   964 ;)
       ;
       ; 16x16 bilinear prediction. xoffset and yoffset each select a 32-byte entry
       ; of vp8_bilinear_filters_x86_8 (index << 5); an entry holds two 16-byte tap
       ; vectors, read at [entry] and [entry+16].
       ;
       ; Code paths:
       ;   xoffset != 0, yoffset != 0 : horizontal then vertical pass (.next_row)
       ;   xoffset == 0 (any yoffset) : vertical pass only            (.b16x16_sp_only)
       ;   xoffset != 0, yoffset == 0 : horizontal pass only          (.b16x16_fp_only)
       ;
       ; Each pass computes (a * tap0 + b * tap1 + rd) >> VP8_FILTER_SHIFT on words,
       ; where a/b are horizontally adjacent pixels ([rsi] and [rsi+1]) or vertically
       ; adjacent rows (previous and current).
       ;
       ; Every path stores 16 bytes per row with movdqa, so destination rows must be
       ; 16-byte aligned. Loops run until rdi reaches rcx = dst_ptr + 16 * dst_pitch.
       ; The paths with a vertical pass read one source row more than they write.

   965 extern sym(vp8_bilinear_filters_x86_8)

   966 global sym(vp8_bilinear_predict16x16_sse2) PRIVATE

   967 sym(vp8_bilinear_predict16x16_sse2):

   968     push        rbp

   969     mov         rbp, rsp

   970     SHADOW_ARGS_TO_STACK 6

   971     SAVE_XMM 7

   972     GET_GOT     rbx

   973     push        rsi

   974     push        rdi

   975     ; end prolog

   977     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]

   978     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

   980         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]    ; rcx = filter table base (reused later as end pointer)

   981         movsxd      rax,        dword ptr arg(2) ;xoffset

   983         cmp         rax,        0      ;skip first_pass filter if xoffset=0

   984         je          .b16x16_sp_only

   986         shl         rax,        5      ; 32 bytes per table entry

   987         add         rax,        rcx    ;HFilter

   989         mov         rdi,        arg(4) ;dst_ptr

   990         mov         rsi,        arg(0) ;src_ptr

   991         movsxd      rdx,        dword ptr arg(5) ;dst_pitch

   993         movdqa      xmm1,       [rax]          ; xmm1 = horizontal tap 0

   994         movdqa      xmm2,       [rax+16]       ; xmm2 = horizontal tap 1

   996         movsxd      rax,        dword ptr arg(3) ;yoffset

   998         cmp         rax,        0      ;skip second_pass filter if yoffset=0

   999         je          .b16x16_fp_only

  1001         shl         rax,        5

  1002         add         rax,        rcx    ;VFilter

  1004         lea         rcx,        [rdi+rdx*8]          ; rcx = dst_ptr + 8 * dst_pitch

  1005         lea         rcx,        [rcx+rdx*8]          ; rcx = dst_ptr + 16 * dst_pitch (loop end pointer)

  1006         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

  1008         pxor        xmm0,       xmm0

  1010 %if ABI_IS_32BIT=0

  1011         movsxd      r8,         dword ptr arg(5) ;dst_pitch

  1012 %endif

  1013         ; get the first horizontal line done

  1014         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

  1015         movdqa      xmm4,       xmm3                 ; make a copy of current line

  1017         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

  1018         punpckhbw   xmm4,       xmm0

  1020         pmullw      xmm3,       xmm1

  1021         pmullw      xmm4,       xmm1

  1023         movdqu      xmm5,       [rsi+1]              ; same row shifted by one pixel

  1024         movdqa      xmm6,       xmm5

  1026         punpcklbw   xmm5,       xmm0

  1027         punpckhbw   xmm6,       xmm0

  1029         pmullw      xmm5,       xmm2

  1030         pmullw      xmm6,       xmm2

  1032         paddw       xmm3,       xmm5

  1033         paddw       xmm4,       xmm6

  1035         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1036         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1038         paddw       xmm4,       [GLOBAL(rd)]

  1039         psraw       xmm4,       VP8_FILTER_SHIFT

  1041         movdqa      xmm7,       xmm3

  1042         packuswb    xmm7,       xmm4                 ; xmm7 = horizontally filtered previous row, packed to 16 bytes

  1044         add         rsi,        rdx                 ; next line

  1045 .next_row:

  1046         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

  1047         movdqa      xmm4,       xmm3                 ; make a copy of current line

  1049         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

  1050         punpckhbw   xmm4,       xmm0

  1052         pmullw      xmm3,       xmm1

  1053         pmullw      xmm4,       xmm1

  1055         movdqu      xmm5,       [rsi+1]

  1056         movdqa      xmm6,       xmm5

  1058         punpcklbw   xmm5,       xmm0

  1059         punpckhbw   xmm6,       xmm0

  1061         pmullw      xmm5,       xmm2

  1062         pmullw      xmm6,       xmm2

  1064         paddw       xmm3,       xmm5

  1065         paddw       xmm4,       xmm6

  1067         movdqa      xmm5,       xmm7                 ; unpack the previous filtered row for the vertical pass

  1068         movdqa      xmm6,       xmm7

  1070         punpcklbw   xmm5,       xmm0

  1071         punpckhbw   xmm6,       xmm0

  1073         pmullw      xmm5,       [rax]                ; previous row * vertical tap 0

  1074         pmullw      xmm6,       [rax]

  1076         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1077         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1079         paddw       xmm4,       [GLOBAL(rd)]

  1080         psraw       xmm4,       VP8_FILTER_SHIFT

  1082         movdqa      xmm7,       xmm3

  1083         packuswb    xmm7,       xmm4                 ; keep this filtered row as "previous" for the next iteration

  1085         pmullw      xmm3,       [rax+16]             ; current row * vertical tap 1

  1086         pmullw      xmm4,       [rax+16]

  1088         paddw       xmm3,       xmm5

  1089         paddw       xmm4,       xmm6

  1091         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1092         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1094         paddw       xmm4,       [GLOBAL(rd)]

  1095         psraw       xmm4,       VP8_FILTER_SHIFT

  1097         packuswb    xmm3,       xmm4

  1098         movdqa      [rdi],      xmm3                 ; store the results in the destination

  1100         add         rsi,        rdx                 ; next line

  1101 %if ABI_IS_32BIT

  1102         add         rdi,        DWORD PTR arg(5) ;dst_pitch

  1103 %else

  1104         add         rdi,        r8

  1105 %endif

  1107         cmp         rdi,        rcx                 ; reached dst_ptr + 16 * dst_pitch?

  1108         jne         .next_row

  1110         jmp         .done

  1112 .b16x16_sp_only:
       ; Vertical pass only. On entry rcx still holds the filter table base.

  1113         movsxd      rax,        dword ptr arg(3) ;yoffset

  1114         shl         rax,        5

  1115         add         rax,        rcx    ;VFilter

  1117         mov         rdi,        arg(4) ;dst_ptr

  1118         mov         rsi,        arg(0) ;src_ptr

  1119         movsxd      rdx,        dword ptr arg(5) ;dst_pitch

  1121         movdqa      xmm1,       [rax]          ; xmm1 = vertical tap 0

  1122         movdqa      xmm2,       [rax+16]       ; xmm2 = vertical tap 1

  1124         lea         rcx,        [rdi+rdx*8]

  1125         lea         rcx,        [rcx+rdx*8]    ; rcx = dst_ptr + 16 * dst_pitch (loop end pointer)

  1126         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

  1128         pxor        xmm0,       xmm0

  1130         ; get the first horizontal line done

  1131         movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

  1133         add         rsi,        rax                 ; next line

  1134 .next_row_spo:

  1135         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

  1137         movdqa      xmm5,       xmm7                 ; xmm5/xmm6 = previous row

  1138         movdqa      xmm6,       xmm7

  1140         movdqa      xmm4,       xmm3                 ; make a copy of current line

  1141         movdqa      xmm7,       xmm3                 ; current row becomes "previous" for the next iteration

  1143         punpcklbw   xmm5,       xmm0

  1144         punpckhbw   xmm6,       xmm0

  1145         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

  1146         punpckhbw   xmm4,       xmm0

  1148         pmullw      xmm5,       xmm1                 ; previous row * tap 0

  1149         pmullw      xmm6,       xmm1

  1150         pmullw      xmm3,       xmm2                 ; current row * tap 1

  1151         pmullw      xmm4,       xmm2

  1153         paddw       xmm3,       xmm5

  1154         paddw       xmm4,       xmm6

  1156         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1157         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1159         paddw       xmm4,       [GLOBAL(rd)]

  1160         psraw       xmm4,       VP8_FILTER_SHIFT

  1162         packuswb    xmm3,       xmm4

  1163         movdqa      [rdi],      xmm3                 ; store the results in the destination

  1165         add         rsi,        rax                 ; next line

  1166         add         rdi,        rdx                 ;dst_pitch

  1167         cmp         rdi,        rcx

  1168         jne         .next_row_spo

  1170         jmp         .done

  1172 .b16x16_fp_only:
       ; Horizontal pass only. Reached with rdi = dst_ptr, rsi = src_ptr,
       ; rdx = dst_pitch and xmm1/xmm2 = horizontal taps already loaded.

  1173         lea         rcx,        [rdi+rdx*8]

  1174         lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (loop end pointer)

  1175         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

  1176         pxor        xmm0,       xmm0

  1178 .next_row_fpo:

  1179         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14

  1180         movdqa      xmm4,       xmm3                 ; make a copy of current line

  1182         punpcklbw   xmm3,       xmm0                 ; xx 00 01 02 03 04 05 06

  1183         punpckhbw   xmm4,       xmm0

  1185         pmullw      xmm3,       xmm1

  1186         pmullw      xmm4,       xmm1

  1188         movdqu      xmm5,       [rsi+1]

  1189         movdqa      xmm6,       xmm5

  1191         punpcklbw   xmm5,       xmm0

  1192         punpckhbw   xmm6,       xmm0

  1194         pmullw      xmm5,       xmm2

  1195         pmullw      xmm6,       xmm2

  1197         paddw       xmm3,       xmm5

  1198         paddw       xmm4,       xmm6

  1200         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1201         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1203         paddw       xmm4,       [GLOBAL(rd)]

  1204         psraw       xmm4,       VP8_FILTER_SHIFT

  1206         packuswb    xmm3,       xmm4

  1207         movdqa      [rdi],      xmm3                 ; store the results in the destination

  1209         add         rsi,        rax                 ; next line

  1210         add         rdi,        rdx                 ; dst_pitch

  1211         cmp         rdi,        rcx

  1212         jne         .next_row_fpo

  1214 .done:

  1215     ; begin epilog

  1216     pop rdi

  1217     pop rsi

  1218     RESTORE_GOT

  1219     RESTORE_XMM

  1220     UNSHADOW_ARGS

  1221     pop         rbp

  1222     ret

  1225 ;void vp8_bilinear_predict8x8_sse2

  1226 ;(

  1227 ;    unsigned char  *src_ptr,

  1228 ;    int   src_pixels_per_line,

  1229 ;    int  xoffset,

  1230 ;    int  yoffset,

  1231 ;    unsigned char *dst_ptr,

  1232 ;    int dst_pitch

  1233 ;)
       ;
       ; 8x8 bilinear prediction, always running both the horizontal and the
       ; vertical pass (there is no zero-offset shortcut in this routine).
       ;
       ; Nine source rows are first copied, 16 bytes each, into a 144-byte
       ; 16-byte-aligned scratch area on the stack. rsp itself is then used as the
       ; row cursor: it is advanced by 16 after the first row and after each of the
       ; 8 loop iterations, 9 * 16 = 144 bytes in total. That returns rsp to where
       ; it was before "sub rsp, 144", which is why the epilog uses a bare
       ; "pop rsp" and the "add rsp, 144" is commented out.
       ; NOTE(review): this presumes ALIGN_STACK leaves the caller's rsp on top of
       ; the aligned stack - confirm against the macro definition.
       ;
       ; The loop ends when rdi reaches rcx = dst_ptr + 8 * dst_pitch, so exactly
       ; 8 rows of 8 bytes are written (movq stores).

  1234 global sym(vp8_bilinear_predict8x8_sse2) PRIVATE

  1235 sym(vp8_bilinear_predict8x8_sse2):

  1236     push        rbp

  1237     mov         rbp, rsp

  1238     SHADOW_ARGS_TO_STACK 6

  1239     SAVE_XMM 7

  1240     GET_GOT     rbx

  1241     push        rsi

  1242     push        rdi

  1243     ; end prolog

  1245     ALIGN_STACK 16, rax

  1246     sub         rsp, 144                         ; reserve 144 bytes: 9 rows * 16 bytes

  1248     ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]

  1249     ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

  1250         lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]    ; rcx = filter table base (reused later as end pointer)

  1252         mov         rsi,        arg(0) ;src_ptr

  1253         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

  1255     ;Read 9-line unaligned data in and put them on stack. This gives a big

  1256     ;performance boost.

  1257         movdqu      xmm0,       [rsi]                ; row 0

  1258         lea         rax,        [rdx + rdx*2]        ; rax = 3 * source pitch

  1259         movdqu      xmm1,       [rsi+rdx]            ; row 1

  1260         movdqu      xmm2,       [rsi+rdx*2]          ; row 2

  1261         add         rsi,        rax

  1262         movdqu      xmm3,       [rsi]                ; row 3

  1263         movdqu      xmm4,       [rsi+rdx]            ; row 4

  1264         movdqu      xmm5,       [rsi+rdx*2]          ; row 5

  1265         add         rsi,        rax

  1266         movdqu      xmm6,       [rsi]                ; row 6

  1267         movdqu      xmm7,       [rsi+rdx]            ; row 7

  1269         movdqa      XMMWORD PTR [rsp],            xmm0    ; spill row 0 so xmm0 can load row 8

  1271         movdqu      xmm0,       [rsi+rdx*2]          ; row 8

  1273         movdqa      XMMWORD PTR [rsp+16],         xmm1

  1274         movdqa      XMMWORD PTR [rsp+32],         xmm2

  1275         movdqa      XMMWORD PTR [rsp+48],         xmm3

  1276         movdqa      XMMWORD PTR [rsp+64],         xmm4

  1277         movdqa      XMMWORD PTR [rsp+80],         xmm5

  1278         movdqa      XMMWORD PTR [rsp+96],         xmm6

  1279         movdqa      XMMWORD PTR [rsp+112],        xmm7

  1280         movdqa      XMMWORD PTR [rsp+128],        xmm0

  1282         movsxd      rax,        dword ptr arg(2) ;xoffset

  1283         shl         rax,        5      ; 32 bytes per table entry

  1284         add         rax,        rcx    ;HFilter

  1286         mov         rdi,        arg(4) ;dst_ptr

  1287         movsxd      rdx,        dword ptr arg(5) ;dst_pitch

  1289         movdqa      xmm1,       [rax]          ; xmm1 = horizontal tap 0

  1290         movdqa      xmm2,       [rax+16]       ; xmm2 = horizontal tap 1

  1292         movsxd      rax,        dword ptr arg(3) ;yoffset

  1293         shl         rax,        5

  1294         add         rax,        rcx    ;VFilter

  1296         lea         rcx,        [rdi+rdx*8]    ; rcx = dst_ptr + 8 * dst_pitch (loop end pointer)

  1298         movdqa      xmm5,       [rax]          ; xmm5 = vertical tap 0

  1299         movdqa      xmm6,       [rax+16]       ; xmm6 = vertical tap 1

  1301         pxor        xmm0,       xmm0

  1303         ; get the first horizontal line done

  1304         movdqa      xmm3,       XMMWORD PTR [rsp]

  1305         movdqa      xmm4,       xmm3                 ; make a copy of current line

  1306         psrldq      xmm4,       1

  1308         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07

  1309         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

  1311         pmullw      xmm3,       xmm1

  1312         pmullw      xmm4,       xmm2

  1314         paddw       xmm3,       xmm4

  1316         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1317         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1319         movdqa      xmm7,       xmm3                 ; xmm7 = horizontally filtered previous row (kept as words)

  1320         add         rsp,        16                 ; next line

  1321 .next_row8x8:

  1322         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

  1323         movdqa      xmm4,       xmm3                 ; make a copy of current line

  1324         psrldq      xmm4,       1

  1326         punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07

  1327         punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08

  1329         pmullw      xmm3,       xmm1

  1330         pmullw      xmm4,       xmm2

  1332         paddw       xmm3,       xmm4

  1333         pmullw      xmm7,       xmm5                 ; previous filtered row * vertical tap 0

  1335         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1336         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1338         movdqa      xmm4,       xmm3                 ; save this filtered row before it is scaled

  1340         pmullw      xmm3,       xmm6                 ; current filtered row * vertical tap 1

  1341         paddw       xmm3,       xmm7

  1343         movdqa      xmm7,       xmm4                 ; saved row becomes "previous" for the next iteration

  1345         paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value

  1346         psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128

  1348         packuswb    xmm3,       xmm0

  1349         movq        [rdi],      xmm3                 ; store the results in the destination

  1351         add         rsp,        16                 ; next line

  1352         add         rdi,        rdx

  1354         cmp         rdi,        rcx

  1355         jne         .next_row8x8

       ; rsp has been advanced 9 * 16 = 144 bytes by the "next line" adds above, so
       ; the explicit adjustment below stays disabled.
  1357     ;add rsp, 144

  1358     pop rsp

  1359     ; begin epilog

  1360     pop rdi

  1361     pop rsi

  1362     RESTORE_GOT

  1363     RESTORE_XMM

  1364     UNSHADOW_ARGS

  1365     pop         rbp

  1366     ret

  1369 SECTION_RODATA

  1370 align 16

       ; rd: rounding constant, eight words of 0x40 (64). The filter routines in
       ; this file add it to each word sum before shifting right by 7. It is
       ; 16-byte aligned because it is read with movdqa and as an SSE memory operand.
  1371 rd:

  1372     times 8 dw 0x40

The Tor Browser / file revision

media/libvpx/vp8/common/x86/subpixel_sse2.asm@ac0c01689b40

media/libvpx/vp8/common/x86/subpixel_sse2.asm