media/libvpx/vp8/common/x86/subpixel_sse2.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/common/x86/subpixel_sse2.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1372 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +extern sym(vp8_bilinear_filters_x86_8)
    1.17 +
    1.18 +%define BLOCK_HEIGHT_WIDTH 4
    1.19 +%define VP8_FILTER_WEIGHT 128
    1.20 +%define VP8_FILTER_SHIFT  7
    1.21 +
    1.22 +
    1.23 +;/************************************************************************************
    1.24 +; Notes: vp8_filter_block1d8_h6_sse2 applies a 6 tap filter horizontally to the input pixels and
    1.25 +; produces output_height rows of 8 results. Each result is clamped to 0..255 and stored as a 16-bit
    1.26 +; word. (An earlier note required an even output_height; nothing in the loop below depends on that,
    1.27 +; but the loop is entered unconditionally, so output_height must be non-zero.) ONE row is calculated per iteration using 128 bit operations.
    1.28 +;*************************************************************************************/
    1.29 +;void vp8_filter_block1d8_h6_sse2
    1.30 +;(
    1.31 +;    unsigned char  *src_ptr,               - bytes src_ptr[-2] .. src_ptr[13] are loaded for every row
    1.32 +;    unsigned short *output_ptr,            - written with movdqa, so every row must be 16-byte aligned
    1.33 +;    unsigned int    src_pixels_per_line,   - added to src_ptr after each row (source stride in bytes)
    1.34 +;    unsigned int    pixel_step,            - not referenced by this routine
    1.35 +;    unsigned int    output_height,         - number of rows to produce
    1.36 +;    unsigned int    output_width,          - added to output_ptr after each row (a stride in bytes)
    1.37 +;    short           *vp8_filter            - six 16-byte tap vectors (8 words each) at byte offsets 0, 16, 32, 48, 64, 80
    1.38 +;)
    1.39 +global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
    1.40 +sym(vp8_filter_block1d8_h6_sse2):
    1.41 +    push        rbp
    1.42 +    mov         rbp, rsp
    1.43 +    SHADOW_ARGS_TO_STACK 7
    1.44 +    SAVE_XMM 7
    1.45 +    GET_GOT     rbx
    1.46 +    push        rsi
    1.47 +    push        rdi
    1.48 +    ; end prolog -- register roles: rsi = source row, rdi = destination row, rdx = filter taps, rax = source pitch, rcx = rows left
    1.49 +
    1.50 +        mov         rdx,        arg(6) ;vp8_filter
    1.51 +        mov         rsi,        arg(0) ;src_ptr
    1.52 +
    1.53 +        mov         rdi,        arg(1) ;output_ptr
    1.54 +
    1.55 +        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
    1.56 +        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
    1.57 +%if ABI_IS_32BIT=0
    1.58 +        movsxd      r8,         dword ptr arg(5) ;output_width, kept in r8 when ABI_IS_32BIT is 0
    1.59 +%endif
    1.60 +        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
    1.61 +
    1.62 +.filter_block1d8_h6_rowloop:
    1.63 +        movq        xmm3,       MMWORD PTR [rsi - 2]        ; source bytes -2 .. 5
    1.64 +        movq        xmm1,       MMWORD PTR [rsi + 6]        ; source bytes  6 .. 13
    1.65 +
    1.66 +        prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
    1.67 +
    1.68 +        pslldq      xmm1,       8                           ; move bytes 6 .. 13 into the high half
    1.69 +        por         xmm1,       xmm3                        ; xmm1 = 16 consecutive source bytes -2 .. 13
    1.70 +
    1.71 +        movdqa      xmm4,       xmm1                        ; working copies: each one is byte-shifted
    1.72 +        movdqa      xmm5,       xmm1                        ; below to line up with one of taps 2..5
    1.73 +
    1.74 +        movdqa      xmm6,       xmm1
    1.75 +        movdqa      xmm7,       xmm1
    1.76 +
    1.77 +        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    1.78 +        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
    1.79 +
    1.80 +        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
    1.81 +        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
    1.82 +
    1.83 +        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
    1.84 +        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
    1.85 +
    1.86 +
    1.87 +        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    1.88 +        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
    1.89 +
    1.90 +        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
    1.91 +
    1.92 +        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    1.93 +        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
    1.94 +
    1.95 +        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
    1.96 +
    1.97 +        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    1.98 +        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
    1.99 +
   1.100 +
   1.101 +        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   1.102 +
   1.103 +        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   1.104 +        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   1.105 +
   1.106 +
   1.107 +        paddsw      xmm4,       xmm7                        ; saturating 16-bit sum of the six products,
   1.108 +        paddsw      xmm4,       xmm5                        ; accumulated in the order tap 2, 5, 3, 1, 4, 6
   1.109 +
   1.110 +        paddsw      xmm4,       xmm3
   1.111 +        paddsw      xmm4,       xmm6
   1.112 +
   1.113 +        paddsw      xmm4,       xmm1
   1.114 +        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant rd (defined outside this excerpt)
   1.115 +
   1.116 +        psraw       xmm4,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
   1.117 +
   1.118 +        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes (0..255)
   1.119 +        punpcklbw   xmm4,       xmm0                        ; widen back to 8 zero-extended words
   1.120 +
   1.121 +        movdqa      XMMWORD Ptr [rdi],         xmm4         ; aligned 16-byte store of the finished row
   1.122 +        lea         rsi,        [rsi + rax]                 ; advance the source by one row
   1.123 +
   1.124 +%if ABI_IS_32BIT
   1.125 +        add         rdi,        DWORD Ptr arg(5) ;[output_width]
   1.126 +%else
   1.127 +        add         rdi,        r8                          ; advance the destination by output_width bytes
   1.128 +%endif
   1.129 +        dec         rcx
   1.130 +
   1.131 +        jnz         .filter_block1d8_h6_rowloop                ; next row
   1.132 +
   1.133 +    ; begin epilog
   1.134 +    pop rdi
   1.135 +    pop rsi
   1.136 +    RESTORE_GOT
   1.137 +    RESTORE_XMM
   1.138 +    UNSHADOW_ARGS
   1.139 +    pop         rbp
   1.140 +    ret
   1.141 +
   1.142 +
   1.143 +;void vp8_filter_block1d16_h6_sse2
   1.144 +;(
   1.145 +;    unsigned char  *src_ptr,               - bytes src_ptr[-2] .. src_ptr[21] are loaded for every row
   1.146 +;    unsigned short *output_ptr,            - written with movdqa, so every row must be 16-byte aligned
   1.147 +;    unsigned int    src_pixels_per_line,   - added to src_ptr after each row (source stride in bytes)
   1.148 +;    unsigned int    pixel_step,            - not referenced by this routine
   1.149 +;    unsigned int    output_height,         - number of rows to produce
   1.150 +;    unsigned int    output_width,          - added to output_ptr after each row (a stride in bytes)
   1.151 +;    short           *vp8_filter            - six 16-byte tap vectors (8 words each) at byte offsets 0, 16, 32, 48, 64, 80
   1.152 +;)
   1.153 +;/************************************************************************************
   1.154 +; Notes: vp8_filter_block1d16_h6_sse2 applies a 6 tap filter horizontally to the input pixels and
   1.155 +; produces output_height rows. (An earlier note required an even output_height; nothing in the loop
   1.156 +; below depends on that, but the loop is entered unconditionally, so output_height must be non-zero.) This function handles 16 pixels in horizontal direction as two 8-pixel halves,
   1.157 +; calculating ONE row each iteration; each result is clamped to 0..255 and stored as a 16-bit word (32 bytes per row).
   1.158 +;*************************************************************************************/
   1.159 +global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
   1.160 +sym(vp8_filter_block1d16_h6_sse2):
   1.161 +    push        rbp
   1.162 +    mov         rbp, rsp
   1.163 +    SHADOW_ARGS_TO_STACK 7
   1.164 +    SAVE_XMM 7
   1.165 +    GET_GOT     rbx
   1.166 +    push        rsi
   1.167 +    push        rdi
   1.168 +    ; end prolog -- register roles: rsi = source row, rdi = destination row, rdx = filter taps, rax = source pitch, rcx = rows left
   1.169 +
   1.170 +        mov         rdx,        arg(6) ;vp8_filter
   1.171 +        mov         rsi,        arg(0) ;src_ptr
   1.172 +
   1.173 +        mov         rdi,        arg(1) ;output_ptr
   1.174 +
   1.175 +        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
   1.176 +        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line            ; Pitch for Source
   1.177 +%if ABI_IS_32BIT=0
   1.178 +        movsxd      r8,         dword ptr arg(5) ;output_width, kept in r8 when ABI_IS_32BIT is 0
   1.179 +%endif
   1.180 +
   1.181 +        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   1.182 +
   1.183 +.filter_block1d16_h6_sse2_rowloop:
   1.184 +        movq        xmm3,       MMWORD PTR [rsi - 2]        ; source bytes -2 .. 5
   1.185 +        movq        xmm1,       MMWORD PTR [rsi + 6]        ; source bytes  6 .. 13
   1.186 +
   1.187 +        movq        xmm2,       MMWORD PTR [rsi +14]        ; source bytes 14 .. 21
   1.188 +        pslldq      xmm2,       8                           ; move bytes 14 .. 21 into the high half
   1.189 +
   1.190 +        por         xmm2,       xmm1                        ; xmm2 = source bytes 6 .. 21 (input for the second 8 pixels)
   1.191 +        prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
   1.192 +
   1.193 +        pslldq      xmm1,       8                           ; move bytes 6 .. 13 into the high half
   1.194 +        por         xmm1,       xmm3                        ; xmm1 = source bytes -2 .. 13 (input for the first 8 pixels)
   1.195 +
   1.196 +        movdqa      xmm4,       xmm1                        ; working copies: each one is byte-shifted
   1.197 +        movdqa      xmm5,       xmm1                        ; below to line up with one of taps 2..5
   1.198 +
   1.199 +        movdqa      xmm6,       xmm1
   1.200 +        movdqa      xmm7,       xmm1
   1.201 +
   1.202 +        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   1.203 +        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   1.204 +
   1.205 +        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   1.206 +        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   1.207 +
   1.208 +        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   1.209 +        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   1.210 +
   1.211 +
   1.212 +        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   1.213 +        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   1.214 +
   1.215 +        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   1.216 +
   1.217 +        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   1.218 +        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   1.219 +
   1.220 +        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   1.221 +
   1.222 +        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   1.223 +        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   1.224 +
   1.225 +
   1.226 +        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   1.227 +
   1.228 +        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   1.229 +        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   1.230 +
   1.231 +        paddsw      xmm4,       xmm7                        ; saturating 16-bit sum of the six products,
   1.232 +        paddsw      xmm4,       xmm5                        ; accumulated in the order tap 2, 5, 3, 1, 4, 6
   1.233 +
   1.234 +        paddsw      xmm4,       xmm3
   1.235 +        paddsw      xmm4,       xmm6
   1.236 +
   1.237 +        paddsw      xmm4,       xmm1
   1.238 +        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant rd (defined outside this excerpt)
   1.239 +
   1.240 +        psraw       xmm4,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
   1.241 +
   1.242 +        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes (0..255)
   1.243 +        punpcklbw   xmm4,       xmm0                        ; widen back to 8 zero-extended words
   1.244 +
   1.245 +        movdqa      XMMWORD Ptr [rdi],         xmm4         ; store results for pixels 0 .. 7
   1.246 +
   1.247 +        movdqa      xmm3,       xmm2                        ; second half: same sequence on xmm2 (source bytes 6 .. 21);
   1.248 +        movdqa      xmm4,       xmm2                        ; the lane labels below are relative to pixel 8
   1.249 +
   1.250 +        movdqa      xmm5,       xmm2
   1.251 +        movdqa      xmm6,       xmm2
   1.252 +
   1.253 +        movdqa      xmm7,       xmm2
   1.254 +
   1.255 +        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   1.256 +        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   1.257 +
   1.258 +        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   1.259 +        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   1.260 +
   1.261 +        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   1.262 +        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   1.263 +
   1.264 +
   1.265 +        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   1.266 +        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   1.267 +
   1.268 +        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   1.269 +
   1.270 +        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   1.271 +        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   1.272 +
   1.273 +        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   1.274 +
   1.275 +        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   1.276 +        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   1.277 +
   1.278 +        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   1.279 +
   1.280 +        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   1.281 +        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   1.282 +
   1.283 +
   1.284 +        paddsw      xmm4,       xmm7                        ; same saturating accumulation order as the first half
   1.285 +        paddsw      xmm4,       xmm5
   1.286 +
   1.287 +        paddsw      xmm4,       xmm3
   1.288 +        paddsw      xmm4,       xmm6
   1.289 +
   1.290 +        paddsw      xmm4,       xmm2
   1.291 +        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant rd
   1.292 +
   1.293 +        psraw       xmm4,       7                           ; arithmetic shift right by 7
   1.294 +
   1.295 +        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes (0..255)
   1.296 +        punpcklbw   xmm4,       xmm0                        ; widen back to 8 zero-extended words
   1.297 +
   1.298 +        movdqa      XMMWORD Ptr [rdi+16],      xmm4         ; store results for pixels 8 .. 15
   1.299 +
   1.300 +        lea         rsi,        [rsi + rax]                 ; advance the source by one row
   1.301 +%if ABI_IS_32BIT
   1.302 +        add         rdi,        DWORD Ptr arg(5) ;[output_width]
   1.303 +%else
   1.304 +        add         rdi,        r8                          ; advance the destination by output_width bytes
   1.305 +%endif
   1.306 +
   1.307 +        dec         rcx
   1.308 +        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
   1.309 +
   1.310 +    ; begin epilog
   1.311 +    pop rdi
   1.312 +    pop rsi
   1.313 +    RESTORE_GOT
   1.314 +    RESTORE_XMM
   1.315 +    UNSHADOW_ARGS
   1.316 +    pop         rbp
   1.317 +    ret
   1.318 +
   1.319 +
   1.320 +;void vp8_filter_block1d8_v6_sse2
   1.321 +;(
   1.322 +;    short *src_ptr,                  - rows of 8 words, loaded with movdqa (each row must be 16-byte aligned)
   1.323 +;    unsigned char *output_ptr,       - 8 bytes are stored per row with movq
   1.324 +;    int dst_pitch,                   - added to output_ptr after each row
   1.325 +;    unsigned int pixels_per_line,    - used unscaled as the byte offset between source rows
   1.326 +;    unsigned int pixel_step,         - not referenced by this routine
   1.327 +;    unsigned int output_height,      - number of rows to produce; must be non-zero (the loop is entered unconditionally)
   1.328 +;    unsigned int output_width,       - not referenced by this routine
   1.329 +;    short * vp8_filter               - six 16-byte tap vectors (8 words each) at byte offsets 0, 16, 32, 48, 64, 80
   1.330 +;)
   1.331 +;/************************************************************************************
   1.332 +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. Each output row
   1.333 +; combines six consecutive source rows, starting two rows above src_ptr; output_height rows are produced.
   1.334 +;*************************************************************************************/
   1.335 +global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
   1.336 +sym(vp8_filter_block1d8_v6_sse2):
   1.337 +    push        rbp
   1.338 +    mov         rbp, rsp
   1.339 +    SHADOW_ARGS_TO_STACK 8
   1.340 +    SAVE_XMM 7
   1.341 +    GET_GOT     rbx
   1.342 +    push        rsi
   1.343 +    push        rdi
   1.344 +    ; end prolog -- register roles: rsi = first of the six source rows, rdi = destination row, rax = filter taps, rdx = source row offset, rcx = rows left
   1.345 +
   1.346 +        mov         rax,        arg(7) ;vp8_filter
   1.347 +        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
   1.348 +
   1.349 +        mov         rdi,        arg(1) ;output_ptr
   1.350 +        mov         rsi,        arg(0) ;src_ptr
   1.351 +
   1.352 +        sub         rsi,        rdx                         ; step back two source rows so that
   1.353 +        sub         rsi,        rdx                         ; rsi addresses the row used by tap 1
   1.354 +
   1.355 +        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
   1.356 +        pxor        xmm0,       xmm0                        ; clear xmm0
   1.357 +
   1.358 +        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant rd (defined outside this excerpt), loop invariant
   1.359 +%if ABI_IS_32BIT=0
   1.360 +        movsxd      r8,         dword ptr arg(2) ; dst_pitch, kept in r8 when ABI_IS_32BIT is 0
   1.361 +%endif
   1.362 +
   1.363 +.vp8_filter_block1d8_v6_sse2_loop:
   1.364 +        movdqa      xmm1,       XMMWORD PTR [rsi]           ; row 1
   1.365 +        pmullw      xmm1,       [rax]                       ; * tap 1
   1.366 +
   1.367 +        movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row 2
   1.368 +        pmullw      xmm2,       [rax + 16]                  ; * tap 2
   1.369 +
   1.370 +        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row 3
   1.371 +        pmullw      xmm3,       [rax + 32]                  ; * tap 3
   1.372 +
   1.373 +        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row 5
   1.374 +        pmullw      xmm5,       [rax + 64]                  ; * tap 5
   1.375 +
   1.376 +        add         rsi,        rdx                         ; advance one row: this is also the per-iteration source step
   1.377 +        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row 4 (offset 3 * rdx from the old rsi)
   1.378 +
   1.379 +        pmullw      xmm4,       [rax + 48]                  ; * tap 4
   1.380 +        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row 6 (offset 5 * rdx from the old rsi)
   1.381 +
   1.382 +        pmullw      xmm6,       [rax + 80]                  ; * tap 6
   1.383 +
   1.384 +        paddsw      xmm2,       xmm5                        ; saturating 16-bit sum of the six products,
   1.385 +        paddsw      xmm2,       xmm3                        ; accumulated in the order tap 2, 5, 3, 1, 4, 6
   1.386 +
   1.387 +        paddsw      xmm2,       xmm1
   1.388 +        paddsw      xmm2,       xmm4
   1.389 +
   1.390 +        paddsw      xmm2,       xmm6
   1.391 +        paddsw      xmm2,       xmm7                        ; add the rounding constant
   1.392 +
   1.393 +        psraw       xmm2,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
   1.394 +        packuswb    xmm2,       xmm0              ; pack and saturate
   1.395 +
   1.396 +        movq        QWORD PTR [rdi], xmm2         ; store the results in the destination
   1.397 +%if ABI_IS_32BIT
   1.398 +        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
   1.399 +%else
   1.400 +        add         rdi,        r8                          ; advance the destination by dst_pitch
   1.401 +%endif
   1.402 +        dec         rcx         ; decrement count
   1.403 +        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
   1.404 +
   1.405 +    ; begin epilog
   1.406 +    pop rdi
   1.407 +    pop rsi
   1.408 +    RESTORE_GOT
   1.409 +    RESTORE_XMM
   1.410 +    UNSHADOW_ARGS
   1.411 +    pop         rbp
   1.412 +    ret
   1.413 +
   1.414 +
   1.415 +;void vp8_filter_block1d16_v6_sse2
   1.416 +;(
   1.417 +;    unsigned short *src_ptr,         - rows of 16 words, loaded with movdqa at +0 and +16 (each row must be 16-byte aligned)
   1.418 +;    unsigned char *output_ptr,       - 16 bytes are stored per row with movdqa (each row must be 16-byte aligned)
   1.419 +;    int dst_pitch,                   - added to output_ptr after each row
   1.420 +;    unsigned int pixels_per_line,    - used unscaled as the byte offset between source rows
   1.421 +;    unsigned int pixel_step,         - not referenced by this routine
   1.422 +;    unsigned int output_height,      - number of rows to produce; must be non-zero (the loop is entered unconditionally)
   1.423 +;    unsigned int output_width,       - not referenced by this routine
   1.424 +;    const short    *vp8_filter       - six 16-byte tap vectors (8 words each) at byte offsets 0, 16, 32, 48, 64, 80
   1.425 +;)
   1.426 +;/************************************************************************************
   1.427 +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. Each output row
   1.428 +; combines six consecutive source rows, starting two rows above src_ptr; output_height rows are produced.
   1.429 +;*************************************************************************************/
   1.430 +global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
   1.431 +sym(vp8_filter_block1d16_v6_sse2):
   1.432 +    push        rbp
   1.433 +    mov         rbp, rsp
   1.434 +    SHADOW_ARGS_TO_STACK 8
   1.435 +    SAVE_XMM 7
   1.436 +    GET_GOT     rbx
   1.437 +    push        rsi
   1.438 +    push        rdi
   1.439 +    ; end prolog -- register roles: rsi = first of the six source rows, rdi = destination row, rax = filter taps, rdx = source row offset, rcx = rows left
   1.440 +
   1.441 +        mov         rax,        arg(7) ;vp8_filter
   1.442 +        movsxd      rdx,        dword ptr arg(3) ;pixels_per_line
   1.443 +
   1.444 +        mov         rdi,        arg(1) ;output_ptr
   1.445 +        mov         rsi,        arg(0) ;src_ptr
   1.446 +
   1.447 +        sub         rsi,        rdx                         ; step back two source rows so that
   1.448 +        sub         rsi,        rdx                         ; rsi addresses the row used by tap 1
   1.449 +
   1.450 +        movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
   1.451 +%if ABI_IS_32BIT=0
   1.452 +        movsxd      r8,         dword ptr arg(2) ; dst_pitch, kept in r8 when ABI_IS_32BIT is 0
   1.453 +%endif
   1.454 +
   1.455 +.vp8_filter_block1d16_v6_sse2_loop:
   1.456 +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. Odd registers carry words 0..7 of a row, the paired even register words 8..15.
   1.457 +        movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
   1.458 +        movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
   1.459 +        pmullw      xmm1,       [rax + 16]                    ; * tap 2
   1.460 +        pmullw      xmm2,       [rax + 16]
   1.461 +
   1.462 +        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]       ; line 5
   1.463 +        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
   1.464 +        pmullw      xmm3,       [rax + 64]                    ; * tap 5
   1.465 +        pmullw      xmm4,       [rax + 64]
   1.466 +
   1.467 +        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]       ; line 3
   1.468 +        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
   1.469 +        pmullw      xmm5,       [rax + 32]                    ; * tap 3
   1.470 +        pmullw      xmm6,       [rax + 32]
   1.471 +
   1.472 +        movdqa      xmm7,       XMMWORD PTR [rsi]       ; line 1
   1.473 +        movdqa      xmm0,       XMMWORD PTR [rsi + 16]
   1.474 +        pmullw      xmm7,       [rax]                         ; * tap 1
   1.475 +        pmullw      xmm0,       [rax]
   1.476 +
   1.477 +        paddsw      xmm1,       xmm3                          ; saturating 16-bit accumulation: xmm1 = low half sum,
   1.478 +        paddsw      xmm2,       xmm4                          ; xmm2 = high half sum
   1.479 +        paddsw      xmm1,       xmm5
   1.480 +        paddsw      xmm2,       xmm6
   1.481 +        paddsw      xmm1,       xmm7
   1.482 +        paddsw      xmm2,       xmm0
   1.483 +
   1.484 +        add         rsi,        rdx                           ; advance one row: this is also the per-iteration source step
   1.485 +
   1.486 +        movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]       ; line 4
   1.487 +        movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
   1.488 +        pmullw      xmm3,       [rax + 48]                    ; * tap 4
   1.489 +        pmullw      xmm4,       [rax + 48]
   1.490 +
   1.491 +        movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]       ; line 6
   1.492 +        movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
   1.493 +        pmullw      xmm5,       [rax + 80]                    ; * tap 6
   1.494 +        pmullw      xmm6,       [rax + 80]
   1.495 +
   1.496 +        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; rounding constant rd (defined outside this excerpt); reloaded each row because xmm7 held line 1 above
   1.497 +        pxor        xmm0,       xmm0                        ; clear xmm0 -- NOTE(review): xmm0 is not read again before line 1 is reloaded into it, so this looks redundant; confirm
   1.498 +
   1.499 +        paddsw      xmm1,       xmm3
   1.500 +        paddsw      xmm2,       xmm4
   1.501 +        paddsw      xmm1,       xmm5
   1.502 +        paddsw      xmm2,       xmm6
   1.503 +
   1.504 +        paddsw      xmm1,       xmm7                        ; add the rounding constant to both halves
   1.505 +        paddsw      xmm2,       xmm7
   1.506 +
   1.507 +        psraw       xmm1,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
   1.508 +        psraw       xmm2,       7
   1.509 +
   1.510 +        packuswb    xmm1,       xmm2              ; pack and saturate
   1.511 +        movdqa      XMMWORD PTR [rdi], xmm1       ; store the results in the destination
   1.512 +%if ABI_IS_32BIT
   1.513 +        add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
   1.514 +%else
   1.515 +        add         rdi,        r8                          ; advance the destination by dst_pitch
   1.516 +%endif
   1.517 +        dec         rcx         ; decrement count
   1.518 +        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
   1.519 +
   1.520 +    ; begin epilog
   1.521 +    pop rdi
   1.522 +    pop rsi
   1.523 +    RESTORE_GOT
   1.524 +    RESTORE_XMM
   1.525 +    UNSHADOW_ARGS
   1.526 +    pop         rbp
   1.527 +    ret
   1.528 +
   1.529 +
   1.530 +;void vp8_filter_block1d8_h6_only_sse2
   1.531 +;(
   1.532 +;    unsigned char  *src_ptr,               - bytes src_ptr[-2] .. src_ptr[13] are loaded for every row
   1.533 +;    unsigned int    src_pixels_per_line,   - added to src_ptr after each row (source stride in bytes)
   1.534 +;    unsigned char  *output_ptr,            - 8 bytes are stored per row with movq
   1.535 +;    int dst_pitch,                         - added to output_ptr after each row
   1.536 +;    unsigned int    output_height,         - number of rows to produce; must be non-zero (the loop is entered unconditionally)
   1.537 +;    const short    *vp8_filter             - six 16-byte tap vectors (8 words each) at byte offsets 0, 16, 32, 48, 64, 80
   1.538 +;)
   1.539 +; First-pass filter only when yoffset==0: the horizontally filtered row is packed to bytes and written straight to output_ptr.
   1.540 +global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
   1.541 +sym(vp8_filter_block1d8_h6_only_sse2):
   1.542 +    push        rbp
   1.543 +    mov         rbp, rsp
   1.544 +    SHADOW_ARGS_TO_STACK 6
   1.545 +    SAVE_XMM 7
   1.546 +    GET_GOT     rbx
   1.547 +    push        rsi
   1.548 +    push        rdi
   1.549 +    ; end prolog -- register roles: rsi = source row, rdi = destination row, rdx = filter taps, rax = source pitch, rcx = rows left
   1.550 +
   1.551 +        mov         rdx,        arg(5) ;vp8_filter
   1.552 +        mov         rsi,        arg(0) ;src_ptr
   1.553 +
   1.554 +        mov         rdi,        arg(2) ;output_ptr
   1.555 +
   1.556 +        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
   1.557 +        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
   1.558 +%if ABI_IS_32BIT=0
   1.559 +        movsxd      r8,         dword ptr arg(3) ;dst_pitch, kept in r8 when ABI_IS_32BIT is 0
   1.560 +%endif
   1.561 +        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   1.562 +
   1.563 +.filter_block1d8_h6_only_rowloop:
   1.564 +        movq        xmm3,       MMWORD PTR [rsi - 2]        ; source bytes -2 .. 5
   1.565 +        movq        xmm1,       MMWORD PTR [rsi + 6]        ; source bytes  6 .. 13
   1.566 +
   1.567 +        prefetcht2  [rsi+rax-2]                             ; prefetch hint for the next source row
   1.568 +
   1.569 +        pslldq      xmm1,       8                           ; move bytes 6 .. 13 into the high half
   1.570 +        por         xmm1,       xmm3                        ; xmm1 = 16 consecutive source bytes -2 .. 13
   1.571 +
   1.572 +        movdqa      xmm4,       xmm1                        ; working copies: each one is byte-shifted
   1.573 +        movdqa      xmm5,       xmm1                        ; below to line up with one of taps 2..5
   1.574 +
   1.575 +        movdqa      xmm6,       xmm1
   1.576 +        movdqa      xmm7,       xmm1
   1.577 +
   1.578 +        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
   1.579 +        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   1.580 +
   1.581 +        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   1.582 +        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   1.583 +
   1.584 +        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   1.585 +        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   1.586 +
   1.587 +
   1.588 +        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   1.589 +        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   1.590 +
   1.591 +        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   1.592 +
   1.593 +        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   1.594 +        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   1.595 +
   1.596 +        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   1.597 +
   1.598 +        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   1.599 +        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   1.600 +
   1.601 +
   1.602 +        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   1.603 +
   1.604 +        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   1.605 +        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   1.606 +
   1.607 +
   1.608 +        paddsw      xmm4,       xmm7                        ; saturating 16-bit sum of the six products,
   1.609 +        paddsw      xmm4,       xmm5                        ; accumulated in the order tap 2, 5, 3, 1, 4, 6
   1.610 +
   1.611 +        paddsw      xmm4,       xmm3
   1.612 +        paddsw      xmm4,       xmm6
   1.613 +
   1.614 +        paddsw      xmm4,       xmm1
   1.615 +        paddsw      xmm4,       [GLOBAL(rd)]                ; add the rounding constant rd (defined outside this excerpt)
   1.616 +
   1.617 +        psraw       xmm4,       7                           ; arithmetic shift right by 7 (same value as VP8_FILTER_SHIFT)
   1.618 +
   1.619 +        packuswb    xmm4,       xmm0                        ; saturate to unsigned bytes; the 8 results end up in the low qword
   1.620 +
   1.621 +        movq        QWORD PTR [rdi],   xmm4       ; store the results in the destination
   1.622 +        lea         rsi,        [rsi + rax]                 ; advance the source by one row
   1.623 +
   1.624 +%if ABI_IS_32BIT
   1.625 +        add         rdi,        DWORD Ptr arg(3) ;dst_pitch
   1.626 +%else
   1.627 +        add         rdi,        r8                          ; advance the destination by dst_pitch
   1.628 +%endif
   1.629 +        dec         rcx
   1.630 +
   1.631 +        jnz         .filter_block1d8_h6_only_rowloop               ; next row
   1.632 +
   1.633 +    ; begin epilog
   1.634 +    pop rdi
   1.635 +    pop rsi
   1.636 +    RESTORE_GOT
   1.637 +    RESTORE_XMM
   1.638 +    UNSHADOW_ARGS
   1.639 +    pop         rbp
   1.640 +    ret
   1.641 +
   1.642 +
   1.643 +;void vp8_filter_block1d16_h6_only_sse2
   1.644 +;(
   1.645 +;    unsigned char  *src_ptr,
   1.646 +;    unsigned int    src_pixels_per_line,
   1.647 +;    unsigned char  *output_ptr,
   1.648 +;    int dst_pitch,
   1.649 +;    unsigned int    output_height,
   1.650 +;    const short    *vp8_filter
   1.651 +;)
   1.652 +; First-pass filter only when yoffset==0
   1.653 +global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
   1.654 +sym(vp8_filter_block1d16_h6_only_sse2):
   1.655 +    push        rbp
   1.656 +    mov         rbp, rsp
   1.657 +    SHADOW_ARGS_TO_STACK 6
   1.658 +    SAVE_XMM 7
   1.659 +    GET_GOT     rbx
   1.660 +    push        rsi
   1.661 +    push        rdi
   1.662 +    ; end prolog
   1.663 +
   1.664 +        mov         rdx,        arg(5) ;vp8_filter
   1.665 +        mov         rsi,        arg(0) ;src_ptr
   1.666 +
   1.667 +        mov         rdi,        arg(2) ;output_ptr
   1.668 +
   1.669 +        movsxd      rcx,        dword ptr arg(4) ;output_height
   1.670 +        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line            ; Pitch for Source
   1.671 +%if ABI_IS_32BIT=0
   1.672 +        movsxd      r8,         dword ptr arg(3) ;dst_ptich
   1.673 +%endif
   1.674 +
   1.675 +        pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
   1.676 +
   1.677 +.filter_block1d16_h6_only_sse2_rowloop:
   1.678 +        movq        xmm3,       MMWORD PTR [rsi - 2]
   1.679 +        movq        xmm1,       MMWORD PTR [rsi + 6]
   1.680 +
   1.681 +        movq        xmm2,       MMWORD PTR [rsi +14]
   1.682 +        pslldq      xmm2,       8
   1.683 +
   1.684 +        por         xmm2,       xmm1
   1.685 +        prefetcht2  [rsi+rax-2]
   1.686 +
   1.687 +        pslldq      xmm1,       8
   1.688 +        por         xmm1,       xmm3
   1.689 +
   1.690 +        movdqa      xmm4,       xmm1
   1.691 +        movdqa      xmm5,       xmm1
   1.692 +
   1.693 +        movdqa      xmm6,       xmm1
   1.694 +        movdqa      xmm7,       xmm1
   1.695 +
   1.696 +        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
   1.697 +        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   1.698 +
   1.699 +        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   1.700 +        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   1.701 +
   1.702 +        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   1.703 +        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   1.704 +
   1.705 +        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   1.706 +        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   1.707 +
   1.708 +        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   1.709 +
   1.710 +        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   1.711 +        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   1.712 +
   1.713 +        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   1.714 +
   1.715 +        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   1.716 +        psrldq      xmm1,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   1.717 +
   1.718 +        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   1.719 +
   1.720 +        punpcklbw   xmm1,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   1.721 +        pmullw      xmm1,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   1.722 +
   1.723 +        paddsw      xmm4,       xmm7
   1.724 +        paddsw      xmm4,       xmm5
   1.725 +
   1.726 +        paddsw      xmm4,       xmm3
   1.727 +        paddsw      xmm4,       xmm6
   1.728 +
   1.729 +        paddsw      xmm4,       xmm1
   1.730 +        paddsw      xmm4,       [GLOBAL(rd)]
   1.731 +
   1.732 +        psraw       xmm4,       7
   1.733 +
   1.734 +        packuswb    xmm4,       xmm0                        ; lower 8 bytes
   1.735 +
   1.736 +        movq        QWORD Ptr [rdi],         xmm4           ; store the results in the destination
   1.737 +
   1.738 +        movdqa      xmm3,       xmm2
   1.739 +        movdqa      xmm4,       xmm2
   1.740 +
   1.741 +        movdqa      xmm5,       xmm2
   1.742 +        movdqa      xmm6,       xmm2
   1.743 +
   1.744 +        movdqa      xmm7,       xmm2
   1.745 +
   1.746 +        punpcklbw   xmm3,       xmm0                        ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
   1.747 +        psrldq      xmm4,       1                           ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
   1.748 +
   1.749 +        pmullw      xmm3,       XMMWORD PTR [rdx]           ; x[-2] * H[-2]; Tap 1
   1.750 +        punpcklbw   xmm4,       xmm0                        ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
   1.751 +
   1.752 +        psrldq      xmm5,       2                           ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
   1.753 +        pmullw      xmm4,       XMMWORD PTR [rdx+16]        ; x[-1] * H[-1]; Tap 2
   1.754 +
   1.755 +        punpcklbw   xmm5,       xmm0                        ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
   1.756 +        psrldq      xmm6,       3                           ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
   1.757 +
   1.758 +        pmullw      xmm5,       [rdx+32]                    ; x[ 0] * H[ 0]; Tap 3
   1.759 +
   1.760 +        punpcklbw   xmm6,       xmm0                        ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
   1.761 +        psrldq      xmm7,       4                           ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
   1.762 +
   1.763 +        pmullw      xmm6,       [rdx+48]                    ; x[ 1] * h[ 1] ; Tap 4
   1.764 +
   1.765 +        punpcklbw   xmm7,       xmm0                        ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
   1.766 +        psrldq      xmm2,       5                           ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
   1.767 +
   1.768 +        pmullw      xmm7,       [rdx+64]                    ; x[ 2] * h[ 2] ; Tap 5
   1.769 +
   1.770 +        punpcklbw   xmm2,       xmm0                        ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
   1.771 +        pmullw      xmm2,       [rdx+80]                    ; x[ 3] * h[ 3] ; Tap 6
   1.772 +
   1.773 +        paddsw      xmm4,       xmm7
   1.774 +        paddsw      xmm4,       xmm5
   1.775 +
   1.776 +        paddsw      xmm4,       xmm3
   1.777 +        paddsw      xmm4,       xmm6
   1.778 +
   1.779 +        paddsw      xmm4,       xmm2
   1.780 +        paddsw      xmm4,       [GLOBAL(rd)]
   1.781 +
   1.782 +        psraw       xmm4,       7
   1.783 +
   1.784 +        packuswb    xmm4,       xmm0                        ; higher 8 bytes
   1.785 +
   1.786 +        movq        QWORD Ptr [rdi+8],      xmm4            ; store the results in the destination
   1.787 +
   1.788 +        lea         rsi,        [rsi + rax]
   1.789 +%if ABI_IS_32BIT
   1.790 +        add         rdi,        DWORD Ptr arg(3) ;dst_ptich
   1.791 +%else
   1.792 +        add         rdi,        r8
   1.793 +%endif
   1.794 +
   1.795 +        dec         rcx
   1.796 +        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
   1.797 +
   1.798 +    ; begin epilog
   1.799 +    pop rdi
   1.800 +    pop rsi
   1.801 +    RESTORE_GOT
   1.802 +    RESTORE_XMM
   1.803 +    UNSHADOW_ARGS
   1.804 +    pop         rbp
   1.805 +    ret
   1.806 +
   1.807 +
   1.808 +;void vp8_filter_block1d8_v6_only_sse2
   1.809 +;(
   1.810 +;    unsigned char *src_ptr,              ; topmost of the six source rows read for the first output row
   1.811 +;    unsigned int    src_pixels_per_line, ; byte stride between source rows
   1.812 +;    unsigned char *output_ptr,           ; destination; 8 bytes are written per row
   1.813 +;    int dst_pitch,                       ; byte stride between destination rows
   1.814 +;    unsigned int output_height,          ; rows to produce; must be non-zero (the loop tests after decrementing)
   1.815 +;    const short    *vp8_filter           ; six 16-byte tap vectors, read at offsets 0, 16, 32, 48, 64 and 80
   1.816 +;)
   1.817 +; Second-pass (vertical) 6-tap filter only, used when xoffset==0: out = sat8((sum of row[k] * tap[k], k = 0..5, + rd) >> 7)
   1.818 +global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
   1.819 +sym(vp8_filter_block1d8_v6_only_sse2):
   1.820 +    push        rbp
   1.821 +    mov         rbp, rsp
   1.822 +    SHADOW_ARGS_TO_STACK 6
   1.823 +    SAVE_XMM 7
   1.824 +    GET_GOT     rbx
   1.825 +    push        rsi
   1.826 +    push        rdi
   1.827 +    ; end prolog
   1.828 +
   1.829 +        mov         rsi,        arg(0) ;src_ptr
   1.830 +        mov         rdi,        arg(2) ;output_ptr
   1.831 +
   1.832 +        movsxd      rcx,        dword ptr arg(4) ;output_height (row counter)
   1.833 +        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (source stride)
   1.834 +
   1.835 +        mov         rax,        arg(5) ;vp8_filter (base of the six tap vectors)
   1.836 +
   1.837 +        pxor        xmm0,       xmm0                        ; clear xmm0; zero source for byte->word unpack and for packing
   1.838 +
   1.839 +        movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]    ; xmm7 = rounding constant, kept for the whole loop
   1.840 +%if ABI_IS_32BIT=0
   1.841 +        movsxd      r8,         dword ptr arg(3) ; dst_pitch
   1.842 +%endif
   1.843 +
   1.844 +.vp8_filter_block1d8_v6_only_sse2_loop:
   1.845 +        movq        xmm1,       MMWORD PTR [rsi]            ; row 0
   1.846 +        movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 1
   1.847 +        movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 2
   1.848 +        movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 4
   1.849 +        add         rsi,        rdx                         ; step down one row; this is also the per-iteration source advance
   1.850 +        movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 3 (counted from the pre-advance rsi)
   1.851 +        movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 5 (counted from the pre-advance rsi)
   1.852 +
   1.853 +        punpcklbw   xmm1,       xmm0
   1.854 +        pmullw      xmm1,       [rax]                       ; row 0 * tap 1
   1.855 +
   1.856 +        punpcklbw   xmm2,       xmm0
   1.857 +        pmullw      xmm2,       [rax + 16]                  ; row 1 * tap 2
   1.858 +
   1.859 +        punpcklbw   xmm3,       xmm0
   1.860 +        pmullw      xmm3,       [rax + 32]                  ; row 2 * tap 3
   1.861 +
   1.862 +        punpcklbw   xmm5,       xmm0
   1.863 +        pmullw      xmm5,       [rax + 64]                  ; row 4 * tap 5
   1.864 +
   1.865 +        punpcklbw   xmm4,       xmm0
   1.866 +        pmullw      xmm4,       [rax + 48]                  ; row 3 * tap 4
   1.867 +
   1.868 +        punpcklbw   xmm6,       xmm0
   1.869 +        pmullw      xmm6,       [rax + 80]                  ; row 5 * tap 6
   1.870 +
   1.871 +        paddsw      xmm2,       xmm5                        ; accumulate all six products in xmm2 with signed saturation
   1.872 +        paddsw      xmm2,       xmm3
   1.873 +
   1.874 +        paddsw      xmm2,       xmm1
   1.875 +        paddsw      xmm2,       xmm4
   1.876 +
   1.877 +        paddsw      xmm2,       xmm6
   1.878 +        paddsw      xmm2,       xmm7                        ; add the rounding constant
   1.879 +
   1.880 +        psraw       xmm2,       7                           ; arithmetic shift right by 7
   1.881 +        packuswb    xmm2,       xmm0              ; pack words to bytes with unsigned saturation
   1.882 +
   1.883 +        movq        QWORD PTR [rdi], xmm2         ; store the 8 result bytes in the destination
   1.884 +%if ABI_IS_32BIT
   1.885 +        add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
   1.886 +%else
   1.887 +        add         rdi,        r8
   1.888 +%endif
   1.889 +        dec         rcx         ; decrement row count
   1.890 +        jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
   1.891 +
   1.892 +    ; begin epilog
   1.893 +    pop rdi
   1.894 +    pop rsi
   1.895 +    RESTORE_GOT
   1.896 +    RESTORE_XMM
   1.897 +    UNSHADOW_ARGS
   1.898 +    pop         rbp
   1.899 +    ret
   1.900 +
   1.901 +
   1.902 +;void vp8_unpack_block1d16_h6_sse2
   1.903 +;(
   1.904 +;    unsigned char  *src_ptr,             ; source; 16 bytes are read per row
   1.905 +;    unsigned short *output_ptr,          ; destination; 16 words (32 bytes) are written per row with movdqa, so rows must be 16-byte aligned
   1.906 +;    unsigned int    src_pixels_per_line, ; byte stride between source rows
   1.907 +;    unsigned int    output_height,       ; rows to convert; must be non-zero (the loop tests after decrementing)
   1.908 +;    unsigned int    output_width         ; added to the destination pointer after each row, i.e. a byte stride
   1.909 +;)
   1.910 +global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
   1.911 +sym(vp8_unpack_block1d16_h6_sse2):
   1.912 +    push        rbp
   1.913 +    mov         rbp, rsp
   1.914 +    SHADOW_ARGS_TO_STACK 5
   1.915 +    GET_GOT     rbx
   1.916 +    push        rsi
   1.917 +    push        rdi
   1.918 +    ; end prolog
   1.919 +        ; No filtering here: each row of 16 bytes is zero-extended to 16 words and stored.
   1.920 +        pxor        xmm0,       xmm0                        ; all-zero register interleaved in by punpcklbw
   1.921 +
   1.922 +        mov         rdi,        arg(1) ;output_ptr
   1.923 +        mov         rsi,        arg(0) ;src_ptr
   1.924 +
   1.925 +        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line (source stride)
   1.926 +        movsxd      rcx,        dword ptr arg(3) ;output_height (row counter)
   1.927 +%if ABI_IS_32BIT=0
   1.928 +        movsxd      r8,         dword ptr arg(4) ;output_width (destination stride)
   1.929 +%endif
   1.930 +
   1.931 +.unpack_next_row:
   1.932 +        movq        xmm1,       MMWORD PTR [rsi]            ; source bytes 0..7
   1.933 +        punpcklbw   xmm1,       xmm0                        ; -> words 0..7
   1.934 +
   1.935 +        movq        xmm3,       MMWORD PTR [rsi+8]          ; source bytes 8..15
   1.936 +        punpcklbw   xmm3,       xmm0                        ; -> words 8..15
   1.937 +
   1.938 +        movdqa      XMMWORD Ptr [rdi],         xmm1         ; both loads are done before either store
   1.939 +        movdqa      XMMWORD Ptr [rdi + 16],    xmm3
   1.940 +
   1.941 +        add         rsi,        rax                         ; next source row
   1.942 +%if ABI_IS_32BIT
   1.943 +        add         rdi,        DWORD Ptr arg(4) ;[output_width]
   1.944 +%else
   1.945 +        add         rdi,        r8
   1.946 +%endif
   1.947 +        dec         rcx
   1.948 +        jnz         .unpack_next_row                        ; next row
   1.949 +
   1.950 +    ; begin epilog
   1.951 +    pop rdi
   1.952 +    pop rsi
   1.953 +    RESTORE_GOT
   1.954 +    UNSHADOW_ARGS
   1.955 +    pop         rbp
   1.956 +    ret
   1.957 +
   1.958 +
   1.959 +;void vp8_bilinear_predict16x16_sse2
   1.960 +;(
   1.961 +;    unsigned char  *src_ptr,        ; source block; 17 bytes per row are read when the horizontal pass runs
   1.962 +;    int   src_pixels_per_line,      ; byte stride between source rows
   1.963 +;    int  xoffset,                   ; index into vp8_bilinear_filters_x86_8 for the horizontal taps; 0 skips the first pass
   1.964 +;    int  yoffset,                   ; index into vp8_bilinear_filters_x86_8 for the vertical taps; 0 skips the second pass (when xoffset != 0)
   1.965 +;    unsigned char *dst_ptr,         ; destination; rows are stored with movdqa, so each row must be 16-byte aligned
   1.966 +;    int dst_pitch                   ; byte stride between destination rows
   1.967 +;)
   1.968 +extern sym(vp8_bilinear_filters_x86_8)
   1.969 +global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
   1.970 +sym(vp8_bilinear_predict16x16_sse2):
   1.971 +    push        rbp
   1.972 +    mov         rbp, rsp
   1.973 +    SHADOW_ARGS_TO_STACK 6
   1.974 +    SAVE_XMM 7
   1.975 +    GET_GOT     rbx
   1.976 +    push        rsi
   1.977 +    push        rdi
   1.978 +    ; end prolog
   1.979 +    ; Three code paths: both passes (.next_row), vertical only (.b16x16_sp_only), horizontal only (.b16x16_fp_only).
   1.980 +    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
   1.981 +    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
   1.982 +
   1.983 +        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
   1.984 +        movsxd      rax,        dword ptr arg(2) ;xoffset
   1.985 +
   1.986 +        cmp         rax,        0      ;skip first_pass filter if xoffset=0
   1.987 +        je          .b16x16_sp_only
   1.988 +
   1.989 +        shl         rax,        5      ;each table entry is 32 bytes: two 16-byte tap vectors
   1.990 +        add         rax,        rcx    ;HFilter
   1.991 +
   1.992 +        mov         rdi,        arg(4) ;dst_ptr
   1.993 +        mov         rsi,        arg(0) ;src_ptr
   1.994 +        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
   1.995 +
   1.996 +        movdqa      xmm1,       [rax]               ; xmm1 = horizontal tap for pixel x
   1.997 +        movdqa      xmm2,       [rax+16]            ; xmm2 = horizontal tap for pixel x+1
   1.998 +
   1.999 +        movsxd      rax,        dword ptr arg(3) ;yoffset
  1.1000 +
  1.1001 +        cmp         rax,        0      ;skip second_pass filter if yoffset=0
  1.1002 +        je          .b16x16_fp_only
  1.1003 +
  1.1004 +        shl         rax,        5
  1.1005 +        add         rax,        rcx    ;VFilter
  1.1006 +
  1.1007 +        lea         rcx,        [rdi+rdx*8]
  1.1008 +        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end sentinel
  1.1009 +        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line (rdx is the source stride from here on)
  1.1010 +
  1.1011 +        pxor        xmm0,       xmm0                ; zero register for byte->word unpacking
  1.1012 +
  1.1013 +%if ABI_IS_32BIT=0
  1.1014 +        movsxd      r8,         dword ptr arg(5) ;dst_pitch
  1.1015 +%endif
  1.1016 +        ; get the first horizontal line done
  1.1017 +        movdqu      xmm3,       [rsi]               ; source pixels 0..15 of the row
  1.1018 +        movdqa      xmm4,       xmm3                 ; make a copy of current line
  1.1019 +
  1.1020 +        punpcklbw   xmm3,       xmm0                 ; pixels 0..7 as words
  1.1021 +        punpckhbw   xmm4,       xmm0                 ; pixels 8..15 as words
  1.1022 +
  1.1023 +        pmullw      xmm3,       xmm1
  1.1024 +        pmullw      xmm4,       xmm1
  1.1025 +
  1.1026 +        movdqu      xmm5,       [rsi+1]             ; source pixels 1..16 of the row
  1.1027 +        movdqa      xmm6,       xmm5
  1.1028 +
  1.1029 +        punpcklbw   xmm5,       xmm0
  1.1030 +        punpckhbw   xmm6,       xmm0
  1.1031 +
  1.1032 +        pmullw      xmm5,       xmm2
  1.1033 +        pmullw      xmm6,       xmm2
  1.1034 +
  1.1035 +        paddw       xmm3,       xmm5
  1.1036 +        paddw       xmm4,       xmm6
  1.1037 +
  1.1038 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1039 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1040 +
  1.1041 +        paddw       xmm4,       [GLOBAL(rd)]
  1.1042 +        psraw       xmm4,       VP8_FILTER_SHIFT
  1.1043 +
  1.1044 +        movdqa      xmm7,       xmm3
  1.1045 +        packuswb    xmm7,       xmm4                ; xmm7 = first-pass result of the previous row, packed to 16 bytes
  1.1046 +
  1.1047 +        add         rsi,        rdx                 ; next line
  1.1048 +.next_row:
  1.1049 +        movdqu      xmm3,       [rsi]               ; source pixels 0..15 of the current row
  1.1050 +        movdqa      xmm4,       xmm3                 ; make a copy of current line
  1.1051 +
  1.1052 +        punpcklbw   xmm3,       xmm0                 ; pixels 0..7 as words
  1.1053 +        punpckhbw   xmm4,       xmm0                 ; pixels 8..15 as words
  1.1054 +
  1.1055 +        pmullw      xmm3,       xmm1
  1.1056 +        pmullw      xmm4,       xmm1
  1.1057 +
  1.1058 +        movdqu      xmm5,       [rsi+1]             ; source pixels 1..16 of the current row
  1.1059 +        movdqa      xmm6,       xmm5
  1.1060 +
  1.1061 +        punpcklbw   xmm5,       xmm0
  1.1062 +        punpckhbw   xmm6,       xmm0
  1.1063 +
  1.1064 +        pmullw      xmm5,       xmm2
  1.1065 +        pmullw      xmm6,       xmm2
  1.1066 +
  1.1067 +        paddw       xmm3,       xmm5
  1.1068 +        paddw       xmm4,       xmm6
  1.1069 +
  1.1070 +        movdqa      xmm5,       xmm7                ; unpack the previous row's first-pass result
  1.1071 +        movdqa      xmm6,       xmm7
  1.1072 +
  1.1073 +        punpcklbw   xmm5,       xmm0
  1.1074 +        punpckhbw   xmm6,       xmm0
  1.1075 +
  1.1076 +        pmullw      xmm5,       [rax]               ; previous row * first vertical tap
  1.1077 +        pmullw      xmm6,       [rax]
  1.1078 +
  1.1079 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1080 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1081 +
  1.1082 +        paddw       xmm4,       [GLOBAL(rd)]
  1.1083 +        psraw       xmm4,       VP8_FILTER_SHIFT
  1.1084 +
  1.1085 +        movdqa      xmm7,       xmm3
  1.1086 +        packuswb    xmm7,       xmm4                ; save this row's first-pass result for the next iteration
  1.1087 +
  1.1088 +        pmullw      xmm3,       [rax+16]            ; current row * second vertical tap
  1.1089 +        pmullw      xmm4,       [rax+16]
  1.1090 +
  1.1091 +        paddw       xmm3,       xmm5
  1.1092 +        paddw       xmm4,       xmm6
  1.1093 +
  1.1094 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1095 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1096 +
  1.1097 +        paddw       xmm4,       [GLOBAL(rd)]
  1.1098 +        psraw       xmm4,       VP8_FILTER_SHIFT
  1.1099 +
  1.1100 +        packuswb    xmm3,       xmm4
  1.1101 +        movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store)
  1.1102 +
  1.1103 +        add         rsi,        rdx                 ; next line
  1.1104 +%if ABI_IS_32BIT
  1.1105 +        add         rdi,        DWORD PTR arg(5) ;dst_pitch
  1.1106 +%else
  1.1107 +        add         rdi,        r8
  1.1108 +%endif
  1.1109 +
  1.1110 +        cmp         rdi,        rcx                 ; stop once 16 destination rows are written
  1.1111 +        jne         .next_row
  1.1112 +
  1.1113 +        jmp         .done
  1.1114 +
  1.1115 +.b16x16_sp_only:
  1.1116 +        movsxd      rax,        dword ptr arg(3) ;yoffset
  1.1117 +        shl         rax,        5
  1.1118 +        add         rax,        rcx    ;VFilter
  1.1119 +
  1.1120 +        mov         rdi,        arg(4) ;dst_ptr
  1.1121 +        mov         rsi,        arg(0) ;src_ptr
  1.1122 +        movsxd      rdx,        dword ptr arg(5) ;dst_pitch
  1.1123 +
  1.1124 +        movdqa      xmm1,       [rax]               ; xmm1 = vertical tap for the previous row
  1.1125 +        movdqa      xmm2,       [rax+16]            ; xmm2 = vertical tap for the current row
  1.1126 +
  1.1127 +        lea         rcx,        [rdi+rdx*8]
  1.1128 +        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end sentinel
  1.1129 +        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
  1.1130 +
  1.1131 +        pxor        xmm0,       xmm0
  1.1132 +
  1.1133 +        ; no horizontal pass here: the raw source row serves as the previous-row input
  1.1134 +        movdqu      xmm7,       [rsi]               ; source pixels 0..15 of the first row
  1.1135 +
  1.1136 +        add         rsi,        rax                 ; next line
  1.1137 +.next_row_spo:
  1.1138 +        movdqu      xmm3,       [rsi]               ; source pixels 0..15 of the current row
  1.1139 +
  1.1140 +        movdqa      xmm5,       xmm7                ; previous row, two copies for low/high halves
  1.1141 +        movdqa      xmm6,       xmm7
  1.1142 +
  1.1143 +        movdqa      xmm4,       xmm3                 ; make a copy of current line
  1.1144 +        movdqa      xmm7,       xmm3                 ; current row becomes the previous row of the next iteration
  1.1145 +
  1.1146 +        punpcklbw   xmm5,       xmm0
  1.1147 +        punpckhbw   xmm6,       xmm0
  1.1148 +        punpcklbw   xmm3,       xmm0                 ; pixels 0..7 as words
  1.1149 +        punpckhbw   xmm4,       xmm0                 ; pixels 8..15 as words
  1.1150 +
  1.1151 +        pmullw      xmm5,       xmm1
  1.1152 +        pmullw      xmm6,       xmm1
  1.1153 +        pmullw      xmm3,       xmm2
  1.1154 +        pmullw      xmm4,       xmm2
  1.1155 +
  1.1156 +        paddw       xmm3,       xmm5
  1.1157 +        paddw       xmm4,       xmm6
  1.1158 +
  1.1159 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1160 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1161 +
  1.1162 +        paddw       xmm4,       [GLOBAL(rd)]
  1.1163 +        psraw       xmm4,       VP8_FILTER_SHIFT
  1.1164 +
  1.1165 +        packuswb    xmm3,       xmm4
  1.1166 +        movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store)
  1.1167 +
  1.1168 +        add         rsi,        rax                 ; next line
  1.1169 +        add         rdi,        rdx                 ;dst_pitch
  1.1170 +        cmp         rdi,        rcx
  1.1171 +        jne         .next_row_spo
  1.1172 +
  1.1173 +        jmp         .done
  1.1174 +
  1.1175 +.b16x16_fp_only:                                    ; entered with rdi=dst_ptr, rsi=src_ptr, rdx=dst_pitch, xmm1/xmm2=horizontal taps
  1.1176 +        lea         rcx,        [rdi+rdx*8]
  1.1177 +        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch, the loop end sentinel
  1.1178 +        movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
  1.1179 +        pxor        xmm0,       xmm0
  1.1180 +
  1.1181 +.next_row_fpo:
  1.1182 +        movdqu      xmm3,       [rsi]               ; source pixels 0..15 of the current row
  1.1183 +        movdqa      xmm4,       xmm3                 ; make a copy of current line
  1.1184 +
  1.1185 +        punpcklbw   xmm3,       xmm0                 ; pixels 0..7 as words
  1.1186 +        punpckhbw   xmm4,       xmm0                 ; pixels 8..15 as words
  1.1187 +
  1.1188 +        pmullw      xmm3,       xmm1
  1.1189 +        pmullw      xmm4,       xmm1
  1.1190 +
  1.1191 +        movdqu      xmm5,       [rsi+1]             ; source pixels 1..16 of the current row
  1.1192 +        movdqa      xmm6,       xmm5
  1.1193 +
  1.1194 +        punpcklbw   xmm5,       xmm0
  1.1195 +        punpckhbw   xmm6,       xmm0
  1.1196 +
  1.1197 +        pmullw      xmm5,       xmm2
  1.1198 +        pmullw      xmm6,       xmm2
  1.1199 +
  1.1200 +        paddw       xmm3,       xmm5
  1.1201 +        paddw       xmm4,       xmm6
  1.1202 +
  1.1203 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1204 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1205 +
  1.1206 +        paddw       xmm4,       [GLOBAL(rd)]
  1.1207 +        psraw       xmm4,       VP8_FILTER_SHIFT
  1.1208 +
  1.1209 +        packuswb    xmm3,       xmm4
  1.1210 +        movdqa      [rdi],      xmm3                 ; store the results in the destination (aligned store)
  1.1211 +
  1.1212 +        add         rsi,        rax                 ; next line
  1.1213 +        add         rdi,        rdx                 ; dst_pitch
  1.1214 +        cmp         rdi,        rcx
  1.1215 +        jne         .next_row_fpo
  1.1216 +
  1.1217 +.done:
  1.1218 +    ; begin epilog
  1.1219 +    pop rdi
  1.1220 +    pop rsi
  1.1221 +    RESTORE_GOT
  1.1222 +    RESTORE_XMM
  1.1223 +    UNSHADOW_ARGS
  1.1224 +    pop         rbp
  1.1225 +    ret
  1.1226 +
  1.1227 +
  1.1228 +;void vp8_bilinear_predict8x8_sse2
  1.1229 +;(
  1.1230 +;    unsigned char  *src_ptr,        ; source block; 9 rows of 16 bytes are loaded, of which the low 9 bytes per row are used
  1.1231 +;    int   src_pixels_per_line,      ; byte stride between source rows
  1.1232 +;    int  xoffset,                   ; index into vp8_bilinear_filters_x86_8 for the horizontal taps
  1.1233 +;    int  yoffset,                   ; index into vp8_bilinear_filters_x86_8 for the vertical taps
  1.1234 +;    unsigned char *dst_ptr,         ; destination; 8 bytes are written per row
  1.1235 +;    int dst_pitch                   ; byte stride between destination rows
  1.1236 +;)
  1.1237 +global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
  1.1238 +sym(vp8_bilinear_predict8x8_sse2):
  1.1239 +    push        rbp
  1.1240 +    mov         rbp, rsp
  1.1241 +    SHADOW_ARGS_TO_STACK 6
  1.1242 +    SAVE_XMM 7
  1.1243 +    GET_GOT     rbx
  1.1244 +    push        rsi
  1.1245 +    push        rdi
  1.1246 +    ; end prolog
  1.1247 +
  1.1248 +    ALIGN_STACK 16, rax
  1.1249 +    sub         rsp, 144                         ; reserve 144 bytes = 9 rows x 16 bytes
  1.1250 +
  1.1251 +    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
  1.1252 +    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
  1.1253 +        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
  1.1254 +
  1.1255 +        mov         rsi,        arg(0) ;src_ptr
  1.1256 +        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
  1.1257 +
  1.1258 +    ;Read 9-line unaligned data in and put them on stack. This gives a big
  1.1259 +    ;performance boost.
  1.1260 +        movdqu      xmm0,       [rsi]               ; row 0
  1.1261 +        lea         rax,        [rdx + rdx*2]       ; rax = 3 * src_pixels_per_line
  1.1262 +        movdqu      xmm1,       [rsi+rdx]           ; row 1
  1.1263 +        movdqu      xmm2,       [rsi+rdx*2]         ; row 2
  1.1264 +        add         rsi,        rax
  1.1265 +        movdqu      xmm3,       [rsi]               ; row 3
  1.1266 +        movdqu      xmm4,       [rsi+rdx]           ; row 4
  1.1267 +        movdqu      xmm5,       [rsi+rdx*2]         ; row 5
  1.1268 +        add         rsi,        rax
  1.1269 +        movdqu      xmm6,       [rsi]               ; row 6
  1.1270 +        movdqu      xmm7,       [rsi+rdx]           ; row 7
  1.1271 +
  1.1272 +        movdqa      XMMWORD PTR [rsp],            xmm0
  1.1273 +
  1.1274 +        movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 is free again after the store above)
  1.1275 +
  1.1276 +        movdqa      XMMWORD PTR [rsp+16],         xmm1
  1.1277 +        movdqa      XMMWORD PTR [rsp+32],         xmm2
  1.1278 +        movdqa      XMMWORD PTR [rsp+48],         xmm3
  1.1279 +        movdqa      XMMWORD PTR [rsp+64],         xmm4
  1.1280 +        movdqa      XMMWORD PTR [rsp+80],         xmm5
  1.1281 +        movdqa      XMMWORD PTR [rsp+96],         xmm6
  1.1282 +        movdqa      XMMWORD PTR [rsp+112],        xmm7
  1.1283 +        movdqa      XMMWORD PTR [rsp+128],        xmm0
  1.1284 +
  1.1285 +        movsxd      rax,        dword ptr arg(2) ;xoffset
  1.1286 +        shl         rax,        5      ;each table entry is 32 bytes: two 16-byte tap vectors
  1.1287 +        add         rax,        rcx    ;HFilter
  1.1288 +
  1.1289 +        mov         rdi,        arg(4) ;dst_ptr
  1.1290 +        movsxd      rdx,        dword ptr arg(5) ;dst_pitch (rdx is the destination stride from here on)
  1.1291 +
  1.1292 +        movdqa      xmm1,       [rax]               ; xmm1 = horizontal tap for pixel x
  1.1293 +        movdqa      xmm2,       [rax+16]            ; xmm2 = horizontal tap for pixel x+1
  1.1294 +
  1.1295 +        movsxd      rax,        dword ptr arg(3) ;yoffset
  1.1296 +        shl         rax,        5
  1.1297 +        add         rax,        rcx    ;VFilter
  1.1298 +
  1.1299 +        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch, the loop end sentinel
  1.1300 +
  1.1301 +        movdqa      xmm5,       [rax]               ; xmm5 = vertical tap for the previous row
  1.1302 +        movdqa      xmm6,       [rax+16]            ; xmm6 = vertical tap for the current row
  1.1303 +
  1.1304 +        pxor        xmm0,       xmm0                ; zero register for byte->word unpacking
  1.1305 +
  1.1306 +        ; get the first horizontal line done
  1.1307 +        movdqa      xmm3,       XMMWORD PTR [rsp]
  1.1308 +        movdqa      xmm4,       xmm3                 ; make a copy of current line
  1.1309 +        psrldq      xmm4,       1                    ; shift the copy down one byte so lane i holds pixel i+1
  1.1310 +
  1.1311 +        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
  1.1312 +        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
  1.1313 +
  1.1314 +        pmullw      xmm3,       xmm1
  1.1315 +        pmullw      xmm4,       xmm2
  1.1316 +
  1.1317 +        paddw       xmm3,       xmm4
  1.1318 +
  1.1319 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1320 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1321 +
  1.1322 +        movdqa      xmm7,       xmm3                ; xmm7 = first-pass result of the previous row, kept as words
  1.1323 +        add         rsp,        16                 ; next line: rsp doubles as the read cursor into the staged rows
  1.1324 +.next_row8x8:
  1.1325 +        movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  1.1326 +        movdqa      xmm4,       xmm3                 ; make a copy of current line
  1.1327 +        psrldq      xmm4,       1
  1.1328 +
  1.1329 +        punpcklbw   xmm3,       xmm0                 ; 00 01 02 03 04 05 06 07
  1.1330 +        punpcklbw   xmm4,       xmm0                 ; 01 02 03 04 05 06 07 08
  1.1331 +
  1.1332 +        pmullw      xmm3,       xmm1
  1.1333 +        pmullw      xmm4,       xmm2
  1.1334 +
  1.1335 +        paddw       xmm3,       xmm4
  1.1336 +        pmullw      xmm7,       xmm5                ; previous row * first vertical tap
  1.1337 +
  1.1338 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1339 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1340 +
  1.1341 +        movdqa      xmm4,       xmm3                ; keep this row's first-pass result for the next iteration
  1.1342 +
  1.1343 +        pmullw      xmm3,       xmm6                ; current row * second vertical tap
  1.1344 +        paddw       xmm3,       xmm7
  1.1345 +
  1.1346 +        movdqa      xmm7,       xmm4
  1.1347 +
  1.1348 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1349 +        psraw       xmm3,       VP8_FILTER_SHIFT        ; xmm3 /= 128
  1.1350 +
  1.1351 +        packuswb    xmm3,       xmm0
  1.1352 +        movq        [rdi],      xmm3                 ; store the 8 result bytes in the destination
  1.1353 +
  1.1354 +        add         rsp,        16                 ; next line
  1.1355 +        add         rdi,        rdx
  1.1356 +
  1.1357 +        cmp         rdi,        rcx                 ; stop once 8 destination rows are written
  1.1358 +        jne         .next_row8x8
  1.1359 +
  1.1360 +    ;add rsp, 144   (not needed: rsp already advanced 16 before the loop plus 8 x 16 inside it = 144)
  1.1361 +    pop rsp         ; NOTE(review): presumably reloads the pre-alignment rsp pushed by ALIGN_STACK - confirm in x86_abi_support.asm
  1.1362 +    ; begin epilog
  1.1363 +    pop rdi
  1.1364 +    pop rsi
  1.1365 +    RESTORE_GOT
  1.1366 +    RESTORE_XMM
  1.1367 +    UNSHADOW_ARGS
  1.1368 +    pop         rbp
  1.1369 +    ret
  1.1370 +
  1.1371 +
  1.1372 +SECTION_RODATA
  1.1373 +align 16
  1.1374 +rd:                 ; rounding constant added to each word before the shift right by VP8_FILTER_SHIFT (7)
  1.1375 +    times 8 dw 0x40 ; eight words of 64, i.e. half of VP8_FILTER_WEIGHT (128)

mercurial