media/libvpx/vp8/common/x86/subpixel_ssse3.asm

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libvpx/vp8/common/x86/subpixel_ssse3.asm	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1508 @@
     1.4 +;
     1.5 +;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     1.6 +;
     1.7 +;  Use of this source code is governed by a BSD-style license
     1.8 +;  that can be found in the LICENSE file in the root of the source
     1.9 +;  tree. An additional intellectual property rights grant can be found
    1.10 +;  in the file PATENTS.  All contributing project authors may
    1.11 +;  be found in the AUTHORS file in the root of the source tree.
    1.12 +;
    1.13 +
    1.14 +
    1.15 +%include "vpx_ports/x86_abi_support.asm"
    1.16 +; Fixed-point filter constants: VP8_FILTER_WEIGHT (128) == 1 << VP8_FILTER_SHIFT (7).
    1.17 +%define BLOCK_HEIGHT_WIDTH 4
    1.18 +%define VP8_FILTER_WEIGHT 128
    1.19 +%define VP8_FILTER_SHIFT  7
    1.20 +
    1.21 +
    1.22 +;/************************************************************************************
    1.23 +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
    1.24 +; input pixel array has output_height rows. This routine assumes that output_height is an
    1.25 +; even number. This function handles 8 pixels in horizontal direction, calculating ONE
    1.26 +; row each iteration to take advantage of the 128-bit operations.
    1.27 +; If the first dword of the selected k0_k5 entry is zero, a 4-tap loop (k1_k3/k2_k4 only) is run.
    1.28 +; This is an implementation of some of the SSE optimizations first seen in ffvp8
    1.29 +; The row loops are dec/jnz loops that run the body first, so output_height must be non-zero.
    1.30 +;*************************************************************************************/
    1.31 +;void vp8_filter_block1d8_h6_ssse3
    1.32 +;(
    1.33 +;    unsigned char  *src_ptr,               - each row reads bytes [src_ptr - 2, src_ptr + 10]
    1.34 +;    unsigned int    src_pixels_per_line,   - added to the source pointer after every row
    1.35 +;    unsigned char *output_ptr,             - 8 bytes are stored per row
    1.36 +;    unsigned int    output_pitch,          - added to the output pointer for every row
    1.37 +;    unsigned int    output_height,         - number of rows (loop counter)
    1.38 +;    unsigned int    vp8_filter_index       - selects the 16-byte entry at k0_k5 + index*16
    1.39 +;)
    1.40 +global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
    1.41 +sym(vp8_filter_block1d8_h6_ssse3):
    1.42 +    push        rbp
    1.43 +    mov         rbp, rsp
    1.44 +    SHADOW_ARGS_TO_STACK 6
    1.45 +    SAVE_XMM 7
    1.46 +    GET_GOT     rbx
    1.47 +    push        rsi
    1.48 +    push        rdi
    1.49 +    ; end prolog
    1.50 +    ; rdx = vp8_filter_index * 16 (byte offset of the table entry); rsi = 0 for the tap test below
    1.51 +    movsxd      rdx, DWORD PTR arg(5)   ;table index
    1.52 +    xor         rsi, rsi
    1.53 +    shl         rdx, 4
    1.54 +    ; xmm7 = rd, the constant added to every sum before the arithmetic shift right by 7
    1.55 +    movdqa      xmm7, [GLOBAL(rd)]
    1.56 +
    1.57 +    lea         rax, [GLOBAL(k0_k5)]
    1.58 +    add         rax, rdx
    1.59 +    mov         rdi, arg(2)             ;output_ptr
    1.60 +    ; first dword of the k0_k5 entry == 0: skip the k0_k5 multiply and use the 4-tap loop
    1.61 +    cmp         esi, DWORD PTR [rax]
    1.62 +    je          vp8_filter_block1d8_h4_ssse3
    1.63 +
    1.64 +    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    1.65 +    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    1.66 +    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    1.67 +
    1.68 +    mov         rsi, arg(0)             ;src_ptr
    1.69 +    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    1.70 +    movsxd      rcx, dword ptr arg(4)   ;output_height
    1.71 +
    1.72 +    movsxd      rdx, dword ptr arg(3)   ;output_pitch
    1.73 +    ; the loop advances rdi by output_pitch before each store, so start one row early
    1.74 +    sub         rdi, rdx
    1.75 +;xmm3 free
    1.76 +.filter_block1d8_h6_rowloop_ssse3:
    1.77 +    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
    1.78 +
    1.79 +    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
    1.80 +
    1.81 +    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
    1.82 +
    1.83 +    movdqa      xmm1,   xmm0
    1.84 +    pmaddubsw   xmm0,   xmm4                    ; byte pairs (p[i-2], p[i+3]) times k0_k5
    1.85 +    ; the two shuffles re-pair the interleaved bytes for the k2_k4 and k1_k3 multiplies
    1.86 +    movdqa      xmm2,   xmm1
    1.87 +    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    1.88 +
    1.89 +    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    1.90 +    pmaddubsw   xmm1,   xmm5
    1.91 +
    1.92 +    lea         rdi,    [rdi + rdx]
    1.93 +    pmaddubsw   xmm2,   xmm6
    1.94 +
    1.95 +    lea         rsi,    [rsi + rax]
    1.96 +    dec         rcx
    1.97 +    ; ZF from dec rcx is consumed by the jnz below; the SSE ops and the store in between leave EFLAGS alone
    1.98 +    paddsw      xmm0,   xmm1
    1.99 +    paddsw      xmm2,   xmm7
   1.100 +
   1.101 +    paddsw      xmm0,   xmm2
   1.102 +
   1.103 +    psraw       xmm0,   7
   1.104 +    ; saturate the eight 16-bit results to unsigned bytes and store them
   1.105 +    packuswb    xmm0,   xmm0
   1.106 +
   1.107 +    movq        MMWORD Ptr [rdi], xmm0
   1.108 +    jnz         .filter_block1d8_h6_rowloop_ssse3
   1.109 +
   1.110 +    ; begin epilog
   1.111 +    pop rdi
   1.112 +    pop rsi
   1.113 +    RESTORE_GOT
   1.114 +    RESTORE_XMM
   1.115 +    UNSHADOW_ARGS
   1.116 +    pop         rbp
   1.117 +    ret
   1.118 +    ; 4-tap tail of the routine above: entered with rax = table entry, rdi = output_ptr, xmm7 = rd
   1.119 +vp8_filter_block1d8_h4_ssse3:           ; NOTE(review): non-local label, unlike the '.'-prefixed 4-tap labels of the sibling routines
   1.120 +    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   1.121 +    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   1.122 +    ; shuffle masks are kept in registers here (xmm3/xmm4 are not needed for coefficients)
   1.123 +    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
   1.124 +    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
   1.125 +
   1.126 +    mov         rsi, arg(0)             ;src_ptr
   1.127 +
   1.128 +    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
   1.129 +    movsxd      rcx, dword ptr arg(4)   ;output_height
   1.130 +
   1.131 +    movsxd      rdx, dword ptr arg(3)   ;output_pitch
   1.132 +    ; as above: rdi is advanced before each store, so start one row early
   1.133 +    sub         rdi, rdx
   1.134 +
   1.135 +.filter_block1d8_h4_rowloop_ssse3:
   1.136 +    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
   1.137 +
   1.138 +    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
   1.139 +
   1.140 +    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
   1.141 +
   1.142 +    movdqa      xmm2,   xmm0
   1.143 +    pshufb      xmm0,   xmm3                    ; shuf2bfrom1 ordering, multiplied by k2_k4
   1.144 +
   1.145 +    pshufb      xmm2,   xmm4                    ; shuf3bfrom1 ordering, multiplied by k1_k3
   1.146 +    pmaddubsw   xmm0,   xmm5
   1.147 +
   1.148 +    lea         rdi,    [rdi + rdx]
   1.149 +    pmaddubsw   xmm2,   xmm6
   1.150 +
   1.151 +    lea         rsi,    [rsi + rax]
   1.152 +    dec         rcx
   1.153 +    ; ZF from dec rcx is consumed by the jnz below; nothing in between writes EFLAGS
   1.154 +    paddsw      xmm0,   xmm7
   1.155 +
   1.156 +    paddsw      xmm0,   xmm2
   1.157 +
   1.158 +    psraw       xmm0,   7
   1.159 +
   1.160 +    packuswb    xmm0,   xmm0
   1.161 +
   1.162 +    movq        MMWORD Ptr [rdi], xmm0
   1.163 +
   1.164 +    jnz         .filter_block1d8_h4_rowloop_ssse3
   1.165 +
   1.166 +    ; begin epilog
   1.167 +    pop rdi
   1.168 +    pop rsi
   1.169 +    RESTORE_GOT
   1.170 +    RESTORE_XMM
   1.171 +    UNSHADOW_ARGS
   1.172 +    pop         rbp
   1.173 +    ret
   1.174 +;void vp8_filter_block1d16_h6_ssse3     - horizontal 6-tap filter, 16 output pixels per row, one row per iteration
   1.175 +;(
   1.176 +;    unsigned char  *src_ptr,               - each row reads bytes [src_ptr - 2, src_ptr + 18]
   1.177 +;    unsigned int    src_pixels_per_line,   - added to the source pointer after every row
   1.178 +;    unsigned char  *output_ptr,            - stored with movdqa, so every output row must be 16-byte aligned
   1.179 +;    unsigned int    output_pitch,          - added to the output pointer after every row
   1.180 +;    unsigned int    output_height,         - number of rows; must be non-zero (dec/jnz loop)
   1.181 +;    unsigned int    vp8_filter_index       - selects the 16-byte entry at k0_k5 + index*16
   1.182 +;)
   1.183 +global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
   1.184 +sym(vp8_filter_block1d16_h6_ssse3):
   1.185 +    push        rbp
   1.186 +    mov         rbp, rsp
   1.187 +    SHADOW_ARGS_TO_STACK 6
   1.188 +    SAVE_XMM 7
   1.189 +    GET_GOT     rbx
   1.190 +    push        rsi
   1.191 +    push        rdi
   1.192 +    ; end prolog
   1.193 +    ; rdx = vp8_filter_index * 16 (byte offset of the table entry)
   1.194 +    movsxd      rdx, DWORD PTR arg(5)           ;table index
   1.195 +    ; (no zeroed register is needed: this routine has no 4-tap test and always runs all six taps)
   1.196 +    shl         rdx, 4      ;
   1.197 +
   1.198 +    lea         rax, [GLOBAL(k0_k5)]
   1.199 +    add         rax, rdx
   1.200 +
   1.201 +    mov         rdi, arg(2)                     ;output_ptr
   1.202 +
   1.203 +    mov         rsi, arg(0)                     ;src_ptr
   1.204 +
   1.205 +    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
   1.206 +    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   1.207 +    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   1.208 +
   1.209 +    movsxd      rax, dword ptr arg(1)           ;src_pixels_per_line
   1.210 +    movsxd      rcx, dword ptr arg(4)           ;output_height
   1.211 +    movsxd      rdx, dword ptr arg(3)           ;output_pitch
   1.212 +    ; xmm0 accumulates output pixels 0..7, xmm3 pixels 8..15; xmm7 is scratch, so rd is read from memory
   1.213 +.filter_block1d16_h6_rowloop_ssse3:
   1.214 +    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
   1.215 +
   1.216 +    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
   1.217 +
   1.218 +    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10
   1.219 +
   1.220 +    movdqa      xmm1,   xmm0
   1.221 +    pmaddubsw   xmm0,   xmm4                    ; byte pairs (p[i-2], p[i+3]) times k0_k5
   1.222 +    ; the two shuffles re-pair the interleaved bytes for the k2_k4 and k1_k3 multiplies
   1.223 +    movdqa      xmm2,   xmm1
   1.224 +    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
   1.225 +
   1.226 +    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
   1.227 +    movq        xmm3,   MMWORD PTR [rsi +  6]   ;  6  7  8  9 10 11 12 13 (second half starts here)
   1.228 +
   1.229 +    pmaddubsw   xmm1,   xmm5
   1.230 +    movq        xmm7,   MMWORD PTR [rsi + 11]   ; 11 12 13 14 15 16 17 18
   1.231 +
   1.232 +    pmaddubsw   xmm2,   xmm6
   1.233 +    punpcklbw   xmm3,   xmm7                    ; same interleave as above, shifted by 8 pixels
   1.234 +
   1.235 +    paddsw      xmm0,   xmm1
   1.236 +    movdqa      xmm1,   xmm3
   1.237 +
   1.238 +    pmaddubsw   xmm3,   xmm4
   1.239 +    paddsw      xmm0,   xmm2
   1.240 +
   1.241 +    movdqa      xmm2,   xmm1
   1.242 +    paddsw      xmm0,   [GLOBAL(rd)]
   1.243 +
   1.244 +    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
   1.245 +    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
   1.246 +
   1.247 +    psraw       xmm0,   VP8_FILTER_SHIFT
   1.248 +    pmaddubsw   xmm1,   xmm5
   1.249 +
   1.250 +    pmaddubsw   xmm2,   xmm6
   1.251 +    packuswb    xmm0,   xmm0                    ; pixels 0..7 saturated to unsigned bytes
   1.252 +
   1.253 +    lea         rsi,    [rsi + rax]
   1.254 +    paddsw      xmm3,   xmm1
   1.255 +
   1.256 +    paddsw      xmm3,   xmm2
   1.257 +
   1.258 +    paddsw      xmm3,   [GLOBAL(rd)]
   1.259 +
   1.260 +    psraw       xmm3,   VP8_FILTER_SHIFT
   1.261 +
   1.262 +    packuswb    xmm3,   xmm3                    ; pixels 8..15 saturated to unsigned bytes
   1.263 +    ; low qwords of xmm0 and xmm3 are joined into one 16-byte row
   1.264 +    punpcklqdq  xmm0,   xmm3
   1.265 +
   1.266 +    movdqa      XMMWORD Ptr [rdi], xmm0
   1.267 +
   1.268 +    lea         rdi,    [rdi + rdx]
   1.269 +    dec         rcx
   1.270 +    jnz         .filter_block1d16_h6_rowloop_ssse3
   1.271 +
   1.272 +    ; begin epilog
   1.273 +    pop rdi
   1.274 +    pop rsi
   1.275 +    RESTORE_GOT
   1.276 +    RESTORE_XMM
   1.277 +    UNSHADOW_ARGS
   1.278 +    pop         rbp
   1.279 +    ret
   1.280 +
   1.281 +;void vp8_filter_block1d4_h6_ssse3      - horizontal 6-tap (or 4-tap) filter, 4 output pixels per row
   1.282 +;(
   1.283 +;    unsigned char  *src_ptr,               - each row loads 16 bytes starting at src_ptr - 2 (movdqu)
   1.284 +;    unsigned int    src_pixels_per_line,   - added to the source pointer after every row
   1.285 +;    unsigned char  *output_ptr,            - 4 bytes are stored per row (movd)
   1.286 +;    unsigned int    output_pitch,          - added to the output pointer after every row
   1.287 +;    unsigned int    output_height,         - number of rows; must be non-zero (dec/jnz loop)
   1.288 +;    unsigned int    vp8_filter_index       - selects the 16-byte entry at k0_k5 + index*16
   1.289 +;)
   1.290 +global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
   1.291 +sym(vp8_filter_block1d4_h6_ssse3):
   1.292 +    push        rbp
   1.293 +    mov         rbp, rsp
   1.294 +    SHADOW_ARGS_TO_STACK 6
   1.295 +    SAVE_XMM 7
   1.296 +    GET_GOT     rbx
   1.297 +    push        rsi
   1.298 +    push        rdi
   1.299 +    ; end prolog
   1.300 +    ; rdx = vp8_filter_index * 16 (byte offset of the table entry); rsi = 0 for the tap test below
   1.301 +    movsxd      rdx, DWORD PTR arg(5)   ;table index
   1.302 +    xor         rsi, rsi
   1.303 +    shl         rdx, 4      ;
   1.304 +
   1.305 +    lea         rax, [GLOBAL(k0_k5)]
   1.306 +    add         rax, rdx
   1.307 +    movdqa      xmm7, [GLOBAL(rd)]
   1.308 +    ; first dword of the k0_k5 entry == 0: skip the k0_k5 multiply and use the 4-tap loop
   1.309 +    cmp         esi, DWORD PTR [rax]
   1.310 +    je          .vp8_filter_block1d4_h4_ssse3
   1.311 +
   1.312 +    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
   1.313 +    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   1.314 +    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   1.315 +
   1.316 +    mov         rsi, arg(0)             ;src_ptr
   1.317 +    mov         rdi, arg(2)             ;output_ptr
   1.318 +    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
   1.319 +    movsxd      rcx, dword ptr arg(4)   ;output_height
   1.320 +
   1.321 +    movsxd      rdx, dword ptr arg(3)   ;output_pitch
   1.322 +
   1.323 +;xmm3 free
   1.324 +.filter_block1d4_h6_rowloop_ssse3:
   1.325 +    movdqu      xmm0,   XMMWORD PTR [rsi - 2]
   1.326 +    ; three shuffled copies of the row are multiplied by k0_k5, k2_k4 and k1_k3 respectively
   1.327 +    movdqa      xmm1, xmm0
   1.328 +    pshufb      xmm0, [GLOBAL(shuf1b)]
   1.329 +
   1.330 +    movdqa      xmm2, xmm1
   1.331 +    pshufb      xmm1, [GLOBAL(shuf2b)]
   1.332 +    pmaddubsw   xmm0, xmm4
   1.333 +    pshufb      xmm2, [GLOBAL(shuf3b)]
   1.334 +    pmaddubsw   xmm1, xmm5
   1.335 +
   1.336 +;--
   1.337 +    pmaddubsw   xmm2, xmm6
   1.338 +
   1.339 +    lea         rsi,    [rsi + rax]
   1.340 +;--
   1.341 +    paddsw      xmm0, xmm1
   1.342 +    paddsw      xmm0, xmm7
   1.343 +    ; (xmm1 is reloaded at the top of the loop, so it is not cleared here)
   1.344 +    paddsw      xmm0, xmm2
   1.345 +    psraw       xmm0, VP8_FILTER_SHIFT
   1.346 +    packuswb    xmm0, xmm0
   1.347 +    ; only the low four result bytes are stored
   1.348 +    movd        DWORD PTR [rdi], xmm0
   1.349 +
   1.350 +    add         rdi, rdx
   1.351 +    dec         rcx
   1.352 +    jnz         .filter_block1d4_h6_rowloop_ssse3
   1.353 +
   1.354 +    ; begin epilog
   1.355 +    pop rdi
   1.356 +    pop rsi
   1.357 +    RESTORE_GOT
   1.358 +    RESTORE_XMM
   1.359 +    UNSHADOW_ARGS
   1.360 +    pop         rbp
   1.361 +    ret
   1.362 +    ; 4-tap variant: entered with rax = table entry and xmm7 = rd; shuffle masks are kept in xmm0/xmm3
   1.363 +.vp8_filter_block1d4_h4_ssse3:
   1.364 +    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
   1.365 +    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
   1.366 +    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
   1.367 +    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
   1.368 +
   1.369 +    mov         rsi, arg(0)             ;src_ptr
   1.370 +    mov         rdi, arg(2)             ;output_ptr
   1.371 +    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
   1.372 +    movsxd      rcx, dword ptr arg(4)   ;output_height
   1.373 +
   1.374 +    movsxd      rdx, dword ptr arg(3)   ;output_pitch
   1.375 +
   1.376 +.filter_block1d4_h4_rowloop_ssse3:
   1.377 +    movdqu      xmm1,   XMMWORD PTR [rsi - 2]
   1.378 +
   1.379 +    movdqa      xmm2, xmm1
   1.380 +    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
   1.381 +    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
   1.382 +    pmaddubsw   xmm1, xmm5
   1.383 +
   1.384 +;--
   1.385 +    pmaddubsw   xmm2, xmm6
   1.386 +
   1.387 +    lea         rsi,    [rsi + rax]
   1.388 +;--
   1.389 +    paddsw      xmm1, xmm7
   1.390 +    paddsw      xmm1, xmm2
   1.391 +    psraw       xmm1, VP8_FILTER_SHIFT
   1.392 +    packuswb    xmm1, xmm1
   1.393 +
   1.394 +    movd        DWORD PTR [rdi], xmm1
   1.395 +
   1.396 +    add         rdi, rdx
   1.397 +    dec         rcx
   1.398 +    jnz         .filter_block1d4_h4_rowloop_ssse3
   1.399 +
   1.400 +    ; begin epilog
   1.401 +    pop rdi
   1.402 +    pop rsi
   1.403 +    RESTORE_GOT
   1.404 +    RESTORE_XMM
   1.405 +    UNSHADOW_ARGS
   1.406 +    pop         rbp
   1.407 +    ret
   1.408 +
   1.409 +
   1.410 +
   1.411 +;void vp8_filter_block1d16_v6_ssse3     - vertical 6-tap (or 4-tap) filter, 16 output pixels per row
   1.412 +;(
   1.413 +;    unsigned char *src_ptr,        - row A of the first output row; rows at +1..+5 pitches are also read
   1.414 +;    unsigned int   src_pitch,      - byte distance between source rows
   1.415 +;    unsigned char *output_ptr,     - 16 bytes are stored per row
   1.416 +;    unsigned int   out_pitch,      - added to the output pointer after every row
   1.417 +;    unsigned int   output_height,  - number of rows; must be non-zero (dec/jnz loop)
   1.418 +;    unsigned int   vp8_filter_index - selects the 16-byte entry at k0_k5 + index*16
   1.419 +;)
   1.420 +global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
   1.421 +sym(vp8_filter_block1d16_v6_ssse3):
   1.422 +    push        rbp
   1.423 +    mov         rbp, rsp
   1.424 +    SHADOW_ARGS_TO_STACK 6
   1.425 +    SAVE_XMM 7
   1.426 +    GET_GOT     rbx
   1.427 +    push        rsi
   1.428 +    push        rdi
   1.429 +    ; end prolog
   1.430 +    ; rdx = vp8_filter_index * 16 (byte offset of the table entry); rsi = 0 for the tap test below
   1.431 +    movsxd      rdx, DWORD PTR arg(5)   ;table index
   1.432 +    xor         rsi, rsi
   1.433 +    shl         rdx, 4      ;
   1.434 +
   1.435 +    lea         rax, [GLOBAL(k0_k5)]
   1.436 +    add         rax, rdx
   1.437 +    ; first dword of the k0_k5 entry == 0: skip the A/F taps and use the 4-tap loop
   1.438 +    cmp         esi, DWORD PTR [rax]
   1.439 +    je          .vp8_filter_block1d16_v4_ssse3
   1.440 +
   1.441 +    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
   1.442 +    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   1.443 +    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   1.444 +
   1.445 +    mov         rsi, arg(0)             ;src_ptr
   1.446 +    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   1.447 +    mov         rdi, arg(2)             ;output_ptr
   1.448 +    ; 64-bit builds keep out_pitch in r8; 32-bit builds add it straight from arg(3) every row
   1.449 +%if ABI_IS_32BIT=0
   1.450 +    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
   1.451 +%endif
   1.452 +    mov         rax, rsi
   1.453 +    movsxd      rcx, DWORD PTR arg(4)   ;output_height
   1.454 +    add         rax, rdx
   1.455 +    ; rax = rsi + pitch, so [rax + rdx*2] is row 3 (D) and [rax + rdx*4] is row 5 (F)
   1.456 +    ; rows A..F are source rows 0..5 counted from rsi; both pointers advance one row per iteration
   1.457 +.vp8_filter_block1d16_v6_ssse3_loop:
   1.458 +    movq        xmm1, MMWORD PTR [rsi]                  ;A
   1.459 +    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   1.460 +    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   1.461 +    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   1.462 +    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   1.463 +
   1.464 +    punpcklbw   xmm2, xmm4                  ;B D
   1.465 +    punpcklbw   xmm3, xmm0                  ;C E
   1.466 +
   1.467 +    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
   1.468 +
   1.469 +    pmaddubsw   xmm3, xmm6                  ;C E times k2_k4
   1.470 +    punpcklbw   xmm1, xmm0                  ;A F
   1.471 +    pmaddubsw   xmm2, xmm7                  ;B D times k1_k3
   1.472 +    pmaddubsw   xmm1, xmm5                  ;A F times k0_k5
   1.473 +
   1.474 +    paddsw      xmm2, xmm3
   1.475 +    paddsw      xmm2, xmm1
   1.476 +    paddsw      xmm2, [GLOBAL(rd)]
   1.477 +    psraw       xmm2, 7
   1.478 +    packuswb    xmm2, xmm2
   1.479 +
   1.480 +    movq        MMWORD PTR [rdi], xmm2          ;store the results
   1.481 +    ; same computation for the right 8 columns (byte offset +8)
   1.482 +    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
   1.483 +    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
   1.484 +    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
   1.485 +    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
   1.486 +    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
   1.487 +
   1.488 +    punpcklbw   xmm2, xmm4                  ;B D
   1.489 +    punpcklbw   xmm3, xmm0                  ;C E
   1.490 +
   1.491 +    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
   1.492 +    pmaddubsw   xmm3, xmm6
   1.493 +    punpcklbw   xmm1, xmm0                  ;A F
   1.494 +    pmaddubsw   xmm2, xmm7
   1.495 +    pmaddubsw   xmm1, xmm5
   1.496 +
   1.497 +    add         rsi,  rdx
   1.498 +    add         rax,  rdx
   1.499 +;--
   1.500 +;--
   1.501 +    paddsw      xmm2, xmm3
   1.502 +    paddsw      xmm2, xmm1
   1.503 +    paddsw      xmm2, [GLOBAL(rd)]
   1.504 +    psraw       xmm2, 7
   1.505 +    packuswb    xmm2, xmm2
   1.506 +
   1.507 +    movq        MMWORD PTR [rdi+8], xmm2
   1.508 +
   1.509 +%if ABI_IS_32BIT
   1.510 +    add         rdi,        DWORD PTR arg(3) ;out_pitch
   1.511 +%else
   1.512 +    add         rdi,        r8
   1.513 +%endif
   1.514 +    dec         rcx
   1.515 +    jnz         .vp8_filter_block1d16_v6_ssse3_loop
   1.516 +
   1.517 +    ; begin epilog
   1.518 +    pop rdi
   1.519 +    pop rsi
   1.520 +    RESTORE_GOT
   1.521 +    RESTORE_XMM
   1.522 +    UNSHADOW_ARGS
   1.523 +    pop         rbp
   1.524 +    ret
   1.525 +    ; 4-tap variant: entered with rax = table entry; only rows B..E and the k1_k3/k2_k4 taps are used
   1.526 +.vp8_filter_block1d16_v4_ssse3:
   1.527 +    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   1.528 +    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   1.529 +
   1.530 +    mov         rsi, arg(0)             ;src_ptr
   1.531 +    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   1.532 +    mov         rdi, arg(2)             ;output_ptr
   1.533 +
   1.534 +%if ABI_IS_32BIT=0
   1.535 +    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
   1.536 +%endif
   1.537 +    mov         rax, rsi
   1.538 +    movsxd      rcx, DWORD PTR arg(4)   ;output_height
   1.539 +    add         rax, rdx
   1.540 +    ; rax = rsi + pitch, so [rax + rdx*2] is row 3 (D)
   1.541 +.vp8_filter_block1d16_v4_ssse3_loop:
   1.542 +    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   1.543 +    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   1.544 +    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   1.545 +    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   1.546 +
   1.547 +    punpcklbw   xmm2, xmm4                  ;B D
   1.548 +    punpcklbw   xmm3, xmm0                  ;C E
   1.549 +
   1.550 +    pmaddubsw   xmm3, xmm6
   1.551 +    pmaddubsw   xmm2, xmm7
   1.552 +    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
   1.553 +    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
   1.554 +    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
   1.555 +    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E
   1.556 +
   1.557 +    paddsw      xmm2, [GLOBAL(rd)]
   1.558 +    paddsw      xmm2, xmm3
   1.559 +    psraw       xmm2, 7
   1.560 +    packuswb    xmm2, xmm2                  ;left 8 columns done
   1.561 +
   1.562 +    punpcklbw   xmm5, xmm4                  ;B D
   1.563 +    punpcklbw   xmm1, xmm0                  ;C E
   1.564 +
   1.565 +    pmaddubsw   xmm1, xmm6
   1.566 +    pmaddubsw   xmm5, xmm7
   1.567 +
   1.568 +    movdqa      xmm4, [GLOBAL(rd)]
   1.569 +    add         rsi,  rdx
   1.570 +    add         rax,  rdx
   1.571 +;--
   1.572 +;--
   1.573 +    paddsw      xmm5, xmm1
   1.574 +    paddsw      xmm5, xmm4
   1.575 +    psraw       xmm5, 7
   1.576 +    packuswb    xmm5, xmm5                  ;right 8 columns done
   1.577 +    ; join both halves into one 16-byte row
   1.578 +    punpcklqdq  xmm2, xmm5
   1.579 +    ; NOTE(review): movdqa needs a 16-byte aligned row here, while the 6-tap loop uses two movq stores - confirm callers align output_ptr/out_pitch
   1.580 +    movdqa       XMMWORD PTR [rdi], xmm2
   1.581 +
   1.582 +%if ABI_IS_32BIT
   1.583 +    add         rdi,        DWORD PTR arg(3) ;out_pitch
   1.584 +%else
   1.585 +    add         rdi,        r8
   1.586 +%endif
   1.587 +    dec         rcx
   1.588 +    jnz         .vp8_filter_block1d16_v4_ssse3_loop
   1.589 +
   1.590 +    ; begin epilog
   1.591 +    pop rdi
   1.592 +    pop rsi
   1.593 +    RESTORE_GOT
   1.594 +    RESTORE_XMM
   1.595 +    UNSHADOW_ARGS
   1.596 +    pop         rbp
   1.597 +    ret
   1.598 +
   1.599 +;void vp8_filter_block1d8_v6_ssse3      - vertical 6-tap (or 4-tap) filter, 8 output pixels per row
   1.600 +;(
   1.601 +;    unsigned char *src_ptr,        - row A of the first output row; rows at +1..+5 pitches are also read
   1.602 +;    unsigned int   src_pitch,      - byte distance between source rows
   1.603 +;    unsigned char *output_ptr,     - 8 bytes are stored per row
   1.604 +;    unsigned int   out_pitch,      - added to the output pointer after every row
   1.605 +;    unsigned int   output_height,  - number of rows; must be non-zero (dec/jnz loop)
   1.606 +;    unsigned int   vp8_filter_index - selects the 16-byte entry at k0_k5 + index*16
   1.607 +;)
   1.608 +global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
   1.609 +sym(vp8_filter_block1d8_v6_ssse3):
   1.610 +    push        rbp
   1.611 +    mov         rbp, rsp
   1.612 +    SHADOW_ARGS_TO_STACK 6
   1.613 +    SAVE_XMM 7
   1.614 +    GET_GOT     rbx
   1.615 +    push        rsi
   1.616 +    push        rdi
   1.617 +    ; end prolog
   1.618 +    ; rdx = vp8_filter_index * 16 (byte offset of the table entry); rsi = 0 for the tap test below
   1.619 +    movsxd      rdx, DWORD PTR arg(5)   ;table index
   1.620 +    xor         rsi, rsi
   1.621 +    shl         rdx, 4      ;
   1.622 +
   1.623 +    lea         rax, [GLOBAL(k0_k5)]
   1.624 +    add         rax, rdx
   1.625 +    ; rdx, rdi, r8 and rcx are loaded before the tap test so that both loops share them
   1.626 +    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   1.627 +    mov         rdi, arg(2)             ;output_ptr
   1.628 +%if ABI_IS_32BIT=0
   1.629 +    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
   1.630 +%endif
   1.631 +    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
   1.632 +    ; first dword of the k0_k5 entry == 0: skip the A/F taps and use the 4-tap loop
   1.633 +    cmp         esi, DWORD PTR [rax]
   1.634 +    je          .vp8_filter_block1d8_v4_ssse3
   1.635 +
   1.636 +    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
   1.637 +    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   1.638 +    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   1.639 +
   1.640 +    mov         rsi, arg(0)             ;src_ptr
   1.641 +    ; rax = rsi + pitch, so [rax + rdx*2] is row 3 (D) and [rax + rdx*4] is row 5 (F)
   1.642 +    mov         rax, rsi
   1.643 +    add         rax, rdx
   1.644 +
   1.645 +.vp8_filter_block1d8_v6_ssse3_loop:
   1.646 +    movq        xmm1, MMWORD PTR [rsi]                  ;A
   1.647 +    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   1.648 +    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   1.649 +    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   1.650 +    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   1.651 +
   1.652 +    punpcklbw   xmm2, xmm4                  ;B D
   1.653 +    punpcklbw   xmm3, xmm0                  ;C E
   1.654 +
   1.655 +    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
   1.656 +    movdqa      xmm4, [GLOBAL(rd)]
   1.657 +
   1.658 +    pmaddubsw   xmm3, xmm6                  ;C E times k2_k4
   1.659 +    punpcklbw   xmm1, xmm0                  ;A F
   1.660 +    pmaddubsw   xmm2, xmm7                  ;B D times k1_k3
   1.661 +    pmaddubsw   xmm1, xmm5                  ;A F times k0_k5
   1.662 +    add         rsi,  rdx
   1.663 +    add         rax,  rdx
   1.664 +;--
   1.665 +;--
   1.666 +    paddsw      xmm2, xmm3
   1.667 +    paddsw      xmm2, xmm1
   1.668 +    paddsw      xmm2, xmm4                  ;add rd before the shift
   1.669 +    psraw       xmm2, 7
   1.670 +    packuswb    xmm2, xmm2                  ;saturate to unsigned bytes
   1.671 +
   1.672 +    movq        MMWORD PTR [rdi], xmm2
   1.673 +
   1.674 +%if ABI_IS_32BIT
   1.675 +    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   1.676 +%else
   1.677 +    add         rdi,        r8
   1.678 +%endif
   1.679 +    dec         rcx
   1.680 +    jnz         .vp8_filter_block1d8_v6_ssse3_loop
   1.681 +
   1.682 +    ; begin epilog
   1.683 +    pop rdi
   1.684 +    pop rsi
   1.685 +    RESTORE_GOT
   1.686 +    RESTORE_XMM
   1.687 +    UNSHADOW_ARGS
   1.688 +    pop         rbp
   1.689 +    ret
   1.690 +    ; 4-tap variant: entered with rax = table entry and rdx/rdi/r8/rcx already loaded; rd is kept in xmm5
   1.691 +.vp8_filter_block1d8_v4_ssse3:
   1.692 +    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
   1.693 +    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
   1.694 +    movdqa      xmm5, [GLOBAL(rd)]
   1.695 +
   1.696 +    mov         rsi, arg(0)             ;src_ptr
   1.697 +    ; rax = rsi + pitch, so [rax + rdx*2] is row 3 (D)
   1.698 +    mov         rax, rsi
   1.699 +    add         rax, rdx
   1.700 +
   1.701 +.vp8_filter_block1d8_v4_ssse3_loop:
   1.702 +    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
   1.703 +    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
   1.704 +    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
   1.705 +    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E
   1.706 +
   1.707 +    punpcklbw   xmm2, xmm4                  ;B D
   1.708 +    punpcklbw   xmm3, xmm0                  ;C E
   1.709 +
   1.710 +    pmaddubsw   xmm3, xmm6
   1.711 +    pmaddubsw   xmm2, xmm7
   1.712 +    add         rsi,  rdx
   1.713 +    add         rax,  rdx
   1.714 +;--
   1.715 +;--
   1.716 +    paddsw      xmm2, xmm3
   1.717 +    paddsw      xmm2, xmm5
   1.718 +    psraw       xmm2, 7
   1.719 +    packuswb    xmm2, xmm2
   1.720 +
   1.721 +    movq        MMWORD PTR [rdi], xmm2
   1.722 +
   1.723 +%if ABI_IS_32BIT
   1.724 +    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   1.725 +%else
   1.726 +    add         rdi,        r8
   1.727 +%endif
   1.728 +    dec         rcx
   1.729 +    jnz         .vp8_filter_block1d8_v4_ssse3_loop
   1.730 +
   1.731 +    ; begin epilog
   1.732 +    pop rdi
   1.733 +    pop rsi
   1.734 +    RESTORE_GOT
   1.735 +    RESTORE_XMM
   1.736 +    UNSHADOW_ARGS
   1.737 +    pop         rbp
   1.738 +    ret
   1.739 +;void vp8_filter_block1d4_v6_ssse3      - vertical 6-tap (or 4-tap) filter, 4 output pixels per row, MMX registers
   1.740 +;(
   1.741 +;    unsigned char *src_ptr,        - row A of the first output row; rows at +1..+5 pitches are also read
   1.742 +;    unsigned int   src_pitch,      - byte distance between source rows
   1.743 +;    unsigned char *output_ptr,     - 4 bytes are stored per row
   1.744 +;    unsigned int   out_pitch,      - added to the output pointer after every row
   1.745 +;    unsigned int   output_height,  - number of rows; must be non-zero (dec/jnz loop)
   1.746 +;    unsigned int   vp8_filter_index - selects the entry at k0_k5 + index*16 (only its low 8 bytes are loaded)
   1.747 +;)
   1.748 +global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
   1.749 +sym(vp8_filter_block1d4_v6_ssse3):
   1.750 +    push        rbp
   1.751 +    mov         rbp, rsp
   1.752 +    SHADOW_ARGS_TO_STACK 6
   1.753 +    GET_GOT     rbx
   1.754 +    push        rsi
   1.755 +    push        rdi
   1.756 +    ; end prolog
   1.757 +    ; NOTE(review): only mm0-mm7 are used and no emms is issued in this routine - presumably callers reset the x87/MMX state; confirm
   1.758 +    movsxd      rdx, DWORD PTR arg(5)   ;table index
   1.759 +    xor         rsi, rsi
   1.760 +    shl         rdx, 4      ;
   1.761 +
   1.762 +    lea         rax, [GLOBAL(k0_k5)]
   1.763 +    add         rax, rdx
   1.764 +    ; rdx, rdi, r8 and rcx are loaded before the tap test so that both loops share them
   1.765 +    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
   1.766 +    mov         rdi, arg(2)             ;output_ptr
   1.767 +%if ABI_IS_32BIT=0
   1.768 +    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
   1.769 +%endif
   1.770 +    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
   1.771 +    ; first dword of the k0_k5 entry == 0 (rsi is still zero): use the 4-tap loop
   1.772 +    cmp         esi, DWORD PTR [rax]
   1.773 +    je          .vp8_filter_block1d4_v4_ssse3
   1.774 +
   1.775 +    movq        mm5, MMWORD PTR [rax]         ;k0_k5
   1.776 +    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
   1.777 +    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
   1.778 +
   1.779 +    mov         rsi, arg(0)             ;src_ptr
   1.780 +    ; rax = rsi + pitch, so [rax + rdx*2] is row 3 (D) and [rax + rdx*4] is row 5 (F)
   1.781 +    mov         rax, rsi
   1.782 +    add         rax, rdx
   1.783 +
   1.784 +.vp8_filter_block1d4_v6_ssse3_loop:
   1.785 +    movd        mm1, DWORD PTR [rsi]                  ;A
   1.786 +    movd        mm2, DWORD PTR [rsi + rdx]            ;B
   1.787 +    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
   1.788 +    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
   1.789 +    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
   1.790 +
   1.791 +    punpcklbw   mm2, mm4                  ;B D
   1.792 +    punpcklbw   mm3, mm0                  ;C E
   1.793 +
   1.794 +    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F
   1.795 +    ; mm4 = low 8 bytes of rd, added before the shift
   1.796 +    movq        mm4, [GLOBAL(rd)]
   1.797 +
   1.798 +    pmaddubsw   mm3, mm6                  ;C E times k2_k4
   1.799 +    punpcklbw   mm1, mm0                  ;A F
   1.800 +    pmaddubsw   mm2, mm7                  ;B D times k1_k3
   1.801 +    pmaddubsw   mm1, mm5                  ;A F times k0_k5
   1.802 +    add         rsi,  rdx
   1.803 +    add         rax,  rdx
   1.804 +;--
   1.805 +;--
   1.806 +    paddsw      mm2, mm3
   1.807 +    paddsw      mm2, mm1
   1.808 +    paddsw      mm2, mm4
   1.809 +    psraw       mm2, 7
   1.810 +    packuswb    mm2, mm2                  ;saturate to unsigned bytes
   1.811 +
   1.812 +    movd        DWORD PTR [rdi], mm2
   1.813 +
   1.814 +%if ABI_IS_32BIT
   1.815 +    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   1.816 +%else
   1.817 +    add         rdi,        r8
   1.818 +%endif
   1.819 +    dec         rcx
   1.820 +    jnz         .vp8_filter_block1d4_v6_ssse3_loop
   1.821 +
   1.822 +    ; begin epilog
   1.823 +    pop rdi
   1.824 +    pop rsi
   1.825 +    RESTORE_GOT
   1.826 +    UNSHADOW_ARGS
   1.827 +    pop         rbp
   1.828 +    ret
   1.829 +    ; 4-tap variant: entered with rax = table entry and rdx/rdi/r8/rcx already loaded; rd is kept in mm5
   1.830 +.vp8_filter_block1d4_v4_ssse3:
   1.831 +    movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
   1.832 +    movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
   1.833 +    movq        mm5, MMWORD PTR [GLOBAL(rd)]
   1.834 +
   1.835 +    mov         rsi, arg(0)             ;src_ptr
   1.836 +    ; rax = rsi + pitch, so [rax + rdx*2] is row 3 (D)
   1.837 +    mov         rax, rsi
   1.838 +    add         rax, rdx
   1.839 +
   1.840 +.vp8_filter_block1d4_v4_ssse3_loop:
   1.841 +    movd        mm2, DWORD PTR [rsi + rdx]            ;B
   1.842 +    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
   1.843 +    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
   1.844 +    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E
   1.845 +
   1.846 +    punpcklbw   mm2, mm4                  ;B D
   1.847 +    punpcklbw   mm3, mm0                  ;C E
   1.848 +
   1.849 +    pmaddubsw   mm3, mm6
   1.850 +    pmaddubsw   mm2, mm7
   1.851 +    add         rsi,  rdx
   1.852 +    add         rax,  rdx
   1.853 +;--
   1.854 +;--
   1.855 +    paddsw      mm2, mm3
   1.856 +    paddsw      mm2, mm5
   1.857 +    psraw       mm2, 7
   1.858 +    packuswb    mm2, mm2
   1.859 +
   1.860 +    movd        DWORD PTR [rdi], mm2
   1.861 +
   1.862 +%if ABI_IS_32BIT
   1.863 +    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
   1.864 +%else
   1.865 +    add         rdi,        r8
   1.866 +%endif
   1.867 +    dec         rcx
   1.868 +    jnz         .vp8_filter_block1d4_v4_ssse3_loop
   1.869 +
   1.870 +    ; begin epilog
   1.871 +    pop rdi
   1.872 +    pop rsi
   1.873 +    RESTORE_GOT
   1.874 +    UNSHADOW_ARGS
   1.875 +    pop         rbp
   1.876 +    ret
   1.877 +; Bilinear 16x16 predictor: horizontal 2-tap pass, then vertical 2-tap pass. The horizontal pass is skipped when xoffset == 0; the vertical pass is skipped when yoffset == 0 and xoffset != 0.
   1.878 +;void vp8_bilinear_predict16x16_ssse3
   1.879 +;(
   1.880 +;    unsigned char  *src_ptr,          - source pixels; rows are src_pixels_per_line bytes apart
   1.881 +;    int   src_pixels_per_line,        - source stride (sign-extended with movsxd)
   1.882 +;    int  xoffset,                     - selects the 16-byte horizontal tap entry of vp8_bilinear_filters_ssse3
   1.883 +;    int  yoffset,                     - selects the 16-byte vertical tap entry of vp8_bilinear_filters_ssse3
   1.884 +;    unsigned char *dst_ptr,           - output; every row is stored with movdqa (aligned 16-byte store)
   1.885 +;    int dst_pitch                     - output stride
   1.886 +;)
   1.887 +global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
   1.888 +sym(vp8_bilinear_predict16x16_ssse3):
   1.889 +    push        rbp
   1.890 +    mov         rbp, rsp
   1.891 +    SHADOW_ARGS_TO_STACK 6
   1.892 +    SAVE_XMM 7
   1.893 +    GET_GOT     rbx
   1.894 +    push        rsi
   1.895 +    push        rdi
   1.896 +    ; end prolog
   1.897 +
   1.898 +        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]    ; rcx = base of the tap table
   1.899 +        movsxd      rax,        dword ptr arg(2)    ; xoffset
   1.900 +
   1.901 +        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
   1.902 +        je          .b16x16_sp_only                 ; -> vertical-only path
   1.903 +
   1.904 +        shl         rax,        4                   ; 16 bytes per table entry
   1.905 +        lea         rax,        [rax + rcx]         ; HFilter
   1.906 +
   1.907 +        mov         rdi,        arg(4)              ; dst_ptr
   1.908 +        mov         rsi,        arg(0)              ; src_ptr
   1.909 +        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
   1.910 +
   1.911 +        movdqa      xmm1,       [rax]               ; xmm1 = horizontal taps (8 byte pairs)
   1.912 +
   1.913 +        movsxd      rax,        dword ptr arg(3)    ; yoffset
   1.914 +
   1.915 +        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
   1.916 +        je          .b16x16_fp_only                 ; -> horizontal-only path
   1.917 +
   1.918 +        shl         rax,        4                   ; 16 bytes per table entry
   1.919 +        lea         rax,        [rax + rcx]         ; VFilter
   1.920 +
   1.921 +        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch
   1.922 +        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
   1.923 +        movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line (rdx no longer holds dst_pitch)
   1.924 +
   1.925 +        movdqa      xmm2,       [rax]               ; xmm2 = vertical taps (8 byte pairs)
   1.926 +
   1.927 +%if ABI_IS_32BIT=0
   1.928 +        movsxd      r8,         dword ptr arg(5)    ; dst_pitch (reloaded into r8 because rdx was reused)
   1.929 +%endif
   1.930 +        movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
   1.931 +        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   1.932 +
   1.933 +        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
   1.934 +        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   1.935 +
   1.936 +        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   1.937 +
   1.938 +        lea         rsi,        [rsi + rdx]         ; next line
   1.939 +
   1.940 +        pmaddubsw   xmm3,       xmm1                ; 8 words: horizontally filtered pixels 00..07
   1.941 +
   1.942 +        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
   1.943 +        pmaddubsw   xmm4,       xmm1                ; 8 words: horizontally filtered pixels 08..15
   1.944 +
   1.945 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
   1.946 +        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
   1.947 +
   1.948 +        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
   1.949 +        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
   1.950 +
   1.951 +        movdqa      xmm7,       xmm3                ; xmm7 will hold first-pass row 0 as bytes
   1.952 +        packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
   1.953 +
   1.954 +.next_row:                                          ; one output row per iteration; xmm7 = previous first-pass row
   1.955 +        movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
   1.956 +        movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08
   1.957 +
   1.958 +        punpcklbw   xmm6,       xmm5                ; 00 01 01 02 ... 07 08
   1.959 +        movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15
   1.960 +
   1.961 +        movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
   1.962 +        lea         rsi,        [rsi + rdx]         ; next line
   1.963 +
   1.964 +        pmaddubsw   xmm6,       xmm1                ; 8 words: horizontally filtered pixels 00..07
   1.965 +
   1.966 +        punpcklbw   xmm4,       xmm5                ; 08 09 09 10 ... 15 16
   1.967 +        pmaddubsw   xmm4,       xmm1                ; 8 words: horizontally filtered pixels 08..15
   1.968 +
   1.969 +        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
   1.970 +        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
   1.971 +
   1.972 +        paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
   1.973 +        psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128
   1.974 +
   1.975 +        packuswb    xmm6,       xmm4                ; xmm6 = current first-pass row as 16 bytes
   1.976 +        movdqa      xmm5,       xmm7                ; xmm5 = previous first-pass row
   1.977 +
   1.978 +        punpcklbw   xmm5,       xmm6                ; interleave previous/current row, pixels 00..07
   1.979 +        pmaddubsw   xmm5,       xmm2                ; vertical taps -> 8 words
   1.980 +
   1.981 +        punpckhbw   xmm7,       xmm6                ; interleave previous/current row, pixels 08..15
   1.982 +        pmaddubsw   xmm7,       xmm2                ; vertical taps -> 8 words
   1.983 +
   1.984 +        paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
   1.985 +        psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128
   1.986 +
   1.987 +        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
   1.988 +        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
   1.989 +
   1.990 +        packuswb    xmm5,       xmm7                ; 16 output bytes
   1.991 +        movdqa      xmm7,       xmm6                ; current row becomes the previous row
   1.992 +
   1.993 +        movdqa      [rdi],      xmm5                ; store the results in the destination
   1.994 +%if ABI_IS_32BIT
   1.995 +        add         rdi,        DWORD PTR arg(5)    ; dst_pitch
   1.996 +%else
   1.997 +        add         rdi,        r8
   1.998 +%endif
   1.999 +
  1.1000 +        cmp         rdi,        rcx                 ; reached dst_ptr + 16*dst_pitch?
  1.1001 +        jne         .next_row
  1.1002 +
  1.1003 +        jmp         .done
  1.1004 +
  1.1005 +.b16x16_sp_only:                                    ; xoffset == 0: vertical pass only, straight from the source
  1.1006 +        movsxd      rax,        dword ptr arg(3)    ; yoffset
  1.1007 +        shl         rax,        4                   ; 16 bytes per table entry
  1.1008 +        lea         rax,        [rax + rcx]         ; VFilter
  1.1009 +        ; NOTE(review): yoffset == 0 is not special-cased on this path; table entry 0 is (128, 0) and pmaddubsw reads taps as signed bytes - presumably callers never pass xoffset == yoffset == 0; confirm.
  1.1010 +        mov         rdi,        arg(4)              ; dst_ptr
  1.1011 +        mov         rsi,        arg(0)              ; src_ptr
  1.1012 +        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
  1.1013 +
  1.1014 +        movdqa      xmm1,       [rax]               ; VFilter
  1.1015 +
  1.1016 +        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch
  1.1017 +        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
  1.1018 +        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
  1.1019 +
  1.1020 +        ; get the first horizontal line done
  1.1021 +        movq        xmm4,       [rsi]               ; row 0, pixels 00..07
  1.1022 +        movq        xmm2,       [rsi + 8]           ; row 0, pixels 08..15
  1.1023 +
  1.1024 +        lea         rsi,        [rsi + rax]         ; next line
  1.1025 +.next_row_sp:                                       ; two output rows per iteration; xmm4/xmm2 = row n (low/high 8 pixels)
  1.1026 +        movq        xmm3,       [rsi]               ; row n+1, pixels 00..07
  1.1027 +        movq        xmm5,       [rsi + 8]           ; row n+1, pixels 08..15
  1.1028 +
  1.1029 +        punpcklbw   xmm4,       xmm3                ; interleave rows n / n+1, pixels 00..07
  1.1030 +        punpcklbw   xmm2,       xmm5                ; interleave rows n / n+1, pixels 08..15
  1.1031 +
  1.1032 +        pmaddubsw   xmm4,       xmm1                ; output row n, pixels 00..07 (words)
  1.1033 +        movq        xmm7,       [rsi + rax]         ; row n+2, pixels 00..07
  1.1034 +
  1.1035 +        pmaddubsw   xmm2,       xmm1                ; output row n, pixels 08..15 (words)
  1.1036 +        movq        xmm6,       [rsi + rax + 8]     ; row n+2, pixels 08..15
  1.1037 +
  1.1038 +        punpcklbw   xmm3,       xmm7                ; interleave rows n+1 / n+2, pixels 00..07
  1.1039 +        punpcklbw   xmm5,       xmm6                ; interleave rows n+1 / n+2, pixels 08..15
  1.1040 +
  1.1041 +        pmaddubsw   xmm3,       xmm1                ; output row n+1, pixels 00..07 (words)
  1.1042 +        paddw       xmm4,       [GLOBAL(rd)]        ; += round value
  1.1043 +
  1.1044 +        pmaddubsw   xmm5,       xmm1                ; output row n+1, pixels 08..15 (words)
  1.1045 +        paddw       xmm2,       [GLOBAL(rd)]        ; += round value
  1.1046 +
  1.1047 +        psraw       xmm4,       VP8_FILTER_SHIFT    ; /= 128
  1.1048 +        psraw       xmm2,       VP8_FILTER_SHIFT    ; /= 128
  1.1049 +
  1.1050 +        packuswb    xmm4,       xmm2                ; output row n as 16 bytes
  1.1051 +        paddw       xmm3,       [GLOBAL(rd)]        ; += round value
  1.1052 +
  1.1053 +        movdqa      [rdi],      xmm4                ; store row 0
  1.1054 +        paddw       xmm5,       [GLOBAL(rd)]        ; += round value
  1.1055 +
  1.1056 +        psraw       xmm3,       VP8_FILTER_SHIFT    ; /= 128
  1.1057 +        psraw       xmm5,       VP8_FILTER_SHIFT    ; /= 128
  1.1058 +
  1.1059 +        packuswb    xmm3,       xmm5                ; output row n+1 as 16 bytes
  1.1060 +        movdqa      xmm4,       xmm7                ; row n+2 becomes row n for the next iteration (pixels 00..07)
  1.1061 +
  1.1062 +        movdqa      [rdi + rdx],xmm3                ; store row 1
  1.1063 +        lea         rsi,        [rsi + 2*rax]       ; advance the source two rows
  1.1064 +
  1.1065 +        movdqa      xmm2,       xmm6                ; row n+2 becomes row n for the next iteration (pixels 08..15)
  1.1066 +        lea         rdi,        [rdi + 2*rdx]       ; advance the destination two rows
  1.1067 +
  1.1068 +        cmp         rdi,        rcx                 ; reached dst_ptr + 16*dst_pitch?
  1.1069 +        jne         .next_row_sp
  1.1070 +
  1.1071 +        jmp         .done
  1.1072 +
  1.1073 +.b16x16_fp_only:                                    ; yoffset == 0: horizontal pass only; rdi/rsi/rdx (dst_pitch)/xmm1 (HFilter) were set before the branch
  1.1074 +        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch
  1.1075 +        lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16*dst_pitch (loop end marker)
  1.1076 +        movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
  1.1077 +
  1.1078 +.next_row_fp:                                       ; two output rows per iteration
  1.1079 +        movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
  1.1080 +        movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
  1.1081 +
  1.1082 +        punpcklbw   xmm2,       xmm4                ; 00 01 01 02 ... 07 08
  1.1083 +        movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15
  1.1084 +
  1.1085 +        pmaddubsw   xmm2,       xmm1                ; first row, filtered pixels 00..07 (words)
  1.1086 +        movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16
  1.1087 +
  1.1088 +        lea         rsi,        [rsi + rax]         ; next line
  1.1089 +        punpcklbw   xmm3,       xmm4                ; 08 09 09 10 ... 15 16
  1.1090 +
  1.1091 +        pmaddubsw   xmm3,       xmm1                ; first row, filtered pixels 08..15 (words)
  1.1092 +        movq        xmm5,       [rsi]               ; second row: 00 01 02 03 04 05 06 07
  1.1093 +
  1.1094 +        paddw       xmm2,       [GLOBAL(rd)]        ; += round value
  1.1095 +        movq        xmm7,       [rsi+1]             ; second row: 01 02 03 04 05 06 07 08
  1.1096 +
  1.1097 +        movq        xmm6,       [rsi+8]             ; second row: 08 09 10 11 12 13 14 15
  1.1098 +        psraw       xmm2,       VP8_FILTER_SHIFT    ; /= 128
  1.1099 +
  1.1100 +        punpcklbw   xmm5,       xmm7                ; second row: 00 01 01 02 ... 07 08
  1.1101 +        movq        xmm7,       [rsi+9]             ; second row: 09 10 11 12 13 14 15 16
  1.1102 +
  1.1103 +        paddw       xmm3,       [GLOBAL(rd)]        ; += round value
  1.1104 +        pmaddubsw   xmm5,       xmm1                ; second row, filtered pixels 00..07 (words)
  1.1105 +
  1.1106 +        psraw       xmm3,       VP8_FILTER_SHIFT    ; /= 128
  1.1107 +        punpcklbw   xmm6,       xmm7                ; second row: 08 09 09 10 ... 15 16
  1.1108 +
  1.1109 +        packuswb    xmm2,       xmm3                ; first row as 16 bytes
  1.1110 +        pmaddubsw   xmm6,       xmm1                ; second row, filtered pixels 08..15 (words)
  1.1111 +
  1.1112 +        movdqa      [rdi],      xmm2                ; store the results in the destination
  1.1113 +        paddw       xmm5,       [GLOBAL(rd)]        ; += round value
  1.1114 +
  1.1115 +        lea         rdi,        [rdi + rdx]         ; dst_pitch
  1.1116 +        psraw       xmm5,       VP8_FILTER_SHIFT    ; /= 128
  1.1117 +
  1.1118 +        paddw       xmm6,       [GLOBAL(rd)]        ; += round value
  1.1119 +        psraw       xmm6,       VP8_FILTER_SHIFT    ; /= 128
  1.1120 +
  1.1121 +        packuswb    xmm5,       xmm6                ; second row as 16 bytes
  1.1122 +        lea         rsi,        [rsi + rax]         ; next line
  1.1123 +
  1.1124 +        movdqa      [rdi],      xmm5                ; store the results in the destination
  1.1125 +        lea         rdi,        [rdi + rdx]         ; dst_pitch
  1.1126 +
  1.1127 +        cmp         rdi,        rcx                 ; reached dst_ptr + 16*dst_pitch?
  1.1128 +
  1.1129 +        jne         .next_row_fp
  1.1130 +
  1.1131 +.done:                                              ; common exit for all three paths
  1.1132 +    ; begin epilog
  1.1133 +    pop         rdi
  1.1134 +    pop         rsi
  1.1135 +    RESTORE_GOT
  1.1136 +    RESTORE_XMM
  1.1137 +    UNSHADOW_ARGS
  1.1138 +    pop         rbp
  1.1139 +    ret
  1.1140 +; Bilinear 8x8 predictor: copies nine 16-byte source rows into an aligned stack buffer, then runs the horizontal and/or vertical 2-tap pass from that buffer. rsp itself is used as the buffer read cursor.
  1.1141 +;void vp8_bilinear_predict8x8_ssse3
  1.1142 +;(
  1.1143 +;    unsigned char  *src_ptr,          - source pixels; 16 bytes are loaded from each of 9 rows
  1.1144 +;    int   src_pixels_per_line,        - source stride (sign-extended with movsxd)
  1.1145 +;    int  xoffset,                     - selects the 16-byte horizontal tap entry of vp8_bilinear_filters_ssse3
  1.1146 +;    int  yoffset,                     - selects the 16-byte vertical tap entry of vp8_bilinear_filters_ssse3
  1.1147 +;    unsigned char *dst_ptr,           - output; each row is stored with an 8-byte movq
  1.1148 +;    int dst_pitch                     - output stride
  1.1149 +;)
  1.1150 +global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
  1.1151 +sym(vp8_bilinear_predict8x8_ssse3):
  1.1152 +    push        rbp
  1.1153 +    mov         rbp, rsp
  1.1154 +    SHADOW_ARGS_TO_STACK 6
  1.1155 +    SAVE_XMM 7
  1.1156 +    GET_GOT     rbx
  1.1157 +    push        rsi
  1.1158 +    push        rdi
  1.1159 +    ; end prolog
  1.1160 +
  1.1161 +    ALIGN_STACK 16, rax                          ; NOTE(review): presumably leaves the pre-alignment rsp on top of the stack; "pop rsp" at .done8x8 depends on that - confirm in x86_abi_support.asm
  1.1162 +    sub         rsp, 144                         ; reserve 144 bytes = 9 rows * 16 bytes
  1.1163 +
  1.1164 +        lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]    ; rcx = base of the tap table
  1.1165 +
  1.1166 +        mov         rsi,        arg(0) ;src_ptr
  1.1167 +        movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line
  1.1168 +
  1.1169 +    ;Read 9-line unaligned data in and put them on stack. This gives a big
  1.1170 +    ;performance boost.
  1.1171 +        movdqu      xmm0,       [rsi]                 ; row 0
  1.1172 +        lea         rax,        [rdx + rdx*2]         ; rax = 3 * src_pixels_per_line
  1.1173 +        movdqu      xmm1,       [rsi+rdx]             ; row 1
  1.1174 +        movdqu      xmm2,       [rsi+rdx*2]           ; row 2
  1.1175 +        add         rsi,        rax                   ; rsi -> row 3
  1.1176 +        movdqu      xmm3,       [rsi]                 ; row 3
  1.1177 +        movdqu      xmm4,       [rsi+rdx]             ; row 4
  1.1178 +        movdqu      xmm5,       [rsi+rdx*2]           ; row 5
  1.1179 +        add         rsi,        rax                   ; rsi -> row 6
  1.1180 +        movdqu      xmm6,       [rsi]                 ; row 6
  1.1181 +        movdqu      xmm7,       [rsi+rdx]             ; row 7
  1.1182 +
  1.1183 +        movdqa      XMMWORD PTR [rsp],            xmm0    ; spill row 0 so xmm0 can take row 8
  1.1184 +
  1.1185 +        movdqu      xmm0,       [rsi+rdx*2]           ; row 8
  1.1186 +
  1.1187 +        movdqa      XMMWORD PTR [rsp+16],         xmm1    ; buffer row 1
  1.1188 +        movdqa      XMMWORD PTR [rsp+32],         xmm2    ; buffer row 2
  1.1189 +        movdqa      XMMWORD PTR [rsp+48],         xmm3    ; buffer row 3
  1.1190 +        movdqa      XMMWORD PTR [rsp+64],         xmm4    ; buffer row 4
  1.1191 +        movdqa      XMMWORD PTR [rsp+80],         xmm5    ; buffer row 5
  1.1192 +        movdqa      XMMWORD PTR [rsp+96],         xmm6    ; buffer row 6
  1.1193 +        movdqa      XMMWORD PTR [rsp+112],        xmm7    ; buffer row 7
  1.1194 +        movdqa      XMMWORD PTR [rsp+128],        xmm0    ; buffer row 8
  1.1195 +
  1.1196 +        movsxd      rax,        dword ptr arg(2)    ; xoffset
  1.1197 +        cmp         rax,        0                   ; skip first_pass filter if xoffset=0
  1.1198 +        je          .b8x8_sp_only                   ; -> vertical-only path
  1.1199 +
  1.1200 +        shl         rax,        4                   ; 16 bytes per table entry
  1.1201 +        add         rax,        rcx                 ; HFilter
  1.1202 +
  1.1203 +        mov         rdi,        arg(4)              ; dst_ptr
  1.1204 +        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
  1.1205 +
  1.1206 +        movdqa      xmm0,       [rax]               ; xmm0 = horizontal taps (8 byte pairs)
  1.1207 +
  1.1208 +        movsxd      rax,        dword ptr arg(3)    ; yoffset
  1.1209 +        cmp         rax,        0                   ; skip second_pass filter if yoffset=0
  1.1210 +        je          .b8x8_fp_only                   ; -> horizontal-only path
  1.1211 +
  1.1212 +        shl         rax,        4                   ; 16 bytes per table entry
  1.1213 +        lea         rax,        [rax + rcx]         ; VFilter
  1.1214 +
  1.1215 +        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end marker)
  1.1216 +
  1.1217 +        movdqa      xmm1,       [rax]               ; xmm1 = vertical taps (8 byte pairs)
  1.1218 +
  1.1219 +        ; get the first horizontal line done
  1.1220 +        movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  1.1221 +        movdqa      xmm5,       xmm3                ; copy of row 0, shifted one byte below
  1.1222 +
  1.1223 +        psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
  1.1224 +        lea         rsp,        [rsp + 16]          ; next line (rsp is the read cursor into the row buffer)
  1.1225 +
  1.1226 +        punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
  1.1227 +        pmaddubsw   xmm3,       xmm0                ; 8 words: horizontally filtered pixels 00..07
  1.1228 +
  1.1229 +        paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
  1.1230 +        psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128
  1.1231 +
  1.1232 +        movdqa      xmm7,       xmm3                ; xmm7 will hold first-pass row 0 as bytes
  1.1233 +        packuswb    xmm7,       xmm7                ; 00..07 as bytes, repeated in both halves
  1.1234 +
  1.1235 +.next_row:                                          ; one output row per iteration; xmm7 = previous first-pass row
  1.1236 +        movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
  1.1237 +        lea         rsp,        [rsp + 16]          ; next line
  1.1238 +
  1.1239 +        movdqa      xmm5,       xmm6                ; copy, shifted one byte below
  1.1240 +
  1.1241 +        psrldq      xmm5,       1                   ; 01 02 ... 15 xx
  1.1242 +
  1.1243 +        punpcklbw   xmm6,       xmm5                ; 00 01 01 02 ... 07 08
  1.1244 +        pmaddubsw   xmm6,       xmm0                ; 8 words: horizontally filtered pixels 00..07
  1.1245 +
  1.1246 +        paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
  1.1247 +        psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128
  1.1248 +
  1.1249 +        packuswb    xmm6,       xmm6                ; current first-pass row as bytes
  1.1250 +
  1.1251 +        punpcklbw   xmm7,       xmm6                ; interleave previous/current first-pass rows
  1.1252 +        pmaddubsw   xmm7,       xmm1                ; vertical taps -> 8 words
  1.1253 +
  1.1254 +        paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
  1.1255 +        psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128
  1.1256 +
  1.1257 +        packuswb    xmm7,       xmm7                ; 8 output bytes in the low half
  1.1258 +
  1.1259 +        movq        [rdi],      xmm7                ; store the results in the destination
  1.1260 +        lea         rdi,        [rdi + rdx]         ; next output row
  1.1261 +
  1.1262 +        movdqa      xmm7,       xmm6                ; current row becomes the previous row
  1.1263 +
  1.1264 +        cmp         rdi,        rcx                 ; reached dst_ptr + 8*dst_pitch?
  1.1265 +        jne         .next_row
  1.1266 +
  1.1267 +        jmp         .done8x8                        ; rsp has advanced 16 + 8*16 = 144 bytes in total
  1.1268 +
  1.1269 +.b8x8_sp_only:                                      ; xoffset == 0: vertical pass only, fully unrolled
  1.1270 +        movsxd      rax,        dword ptr arg(3)    ; yoffset
  1.1271 +        shl         rax,        4                   ; 16 bytes per table entry
  1.1272 +        lea         rax,        [rax + rcx]         ; VFilter
  1.1273 +        ; NOTE(review): yoffset == 0 is not special-cased on this path; table entry 0 is (128, 0) and pmaddubsw reads taps as signed bytes - presumably callers never pass xoffset == yoffset == 0; confirm.
  1.1274 +        mov         rdi,        arg(4) ;dst_ptr
  1.1275 +        movsxd      rdx,        dword ptr arg(5)    ; dst_pitch
  1.1276 +
  1.1277 +        movdqa      xmm0,       [rax]               ; VFilter
  1.1278 +
  1.1279 +        movq        xmm1,       XMMWORD PTR [rsp]       ; row 0 (low 8 bytes)
  1.1280 +        movq        xmm2,       XMMWORD PTR [rsp+16]    ; row 1
  1.1281 +
  1.1282 +        movq        xmm3,       XMMWORD PTR [rsp+32]    ; row 2
  1.1283 +        punpcklbw   xmm1,       xmm2                ; rows 0/1 interleaved
  1.1284 +
  1.1285 +        movq        xmm4,       XMMWORD PTR [rsp+48]    ; row 3
  1.1286 +        punpcklbw   xmm2,       xmm3                ; rows 1/2 interleaved
  1.1287 +
  1.1288 +        movq        xmm5,       XMMWORD PTR [rsp+64]    ; row 4
  1.1289 +        punpcklbw   xmm3,       xmm4                ; rows 2/3 interleaved
  1.1290 +
  1.1291 +        movq        xmm6,       XMMWORD PTR [rsp+80]    ; row 5
  1.1292 +        punpcklbw   xmm4,       xmm5                ; rows 3/4 interleaved
  1.1293 +
  1.1294 +        movq        xmm7,       XMMWORD PTR [rsp+96]    ; row 6
  1.1295 +        punpcklbw   xmm5,       xmm6                ; rows 4/5 interleaved
  1.1296 +
  1.1297 +        pmaddubsw   xmm1,       xmm0                ; output row 0 (words)
  1.1298 +        pmaddubsw   xmm2,       xmm0                ; output row 1 (words)
  1.1299 +
  1.1300 +        pmaddubsw   xmm3,       xmm0                ; output row 2 (words)
  1.1301 +        pmaddubsw   xmm4,       xmm0                ; output row 3 (words)
  1.1302 +
  1.1303 +        pmaddubsw   xmm5,       xmm0                ; output row 4 (words)
  1.1304 +        punpcklbw   xmm6,       xmm7                ; rows 5/6 interleaved
  1.1305 +
  1.1306 +        pmaddubsw   xmm6,       xmm0                ; output row 5 (words)
  1.1307 +        paddw       xmm1,       [GLOBAL(rd)]        ; += round value
  1.1308 +
  1.1309 +        paddw       xmm2,       [GLOBAL(rd)]        ; += round value
  1.1310 +        psraw       xmm1,       VP8_FILTER_SHIFT    ; /= 128
  1.1311 +
  1.1312 +        paddw       xmm3,       [GLOBAL(rd)]        ; += round value
  1.1313 +        psraw       xmm2,       VP8_FILTER_SHIFT    ; /= 128
  1.1314 +
  1.1315 +        paddw       xmm4,       [GLOBAL(rd)]        ; += round value
  1.1316 +        psraw       xmm3,       VP8_FILTER_SHIFT    ; /= 128
  1.1317 +
  1.1318 +        paddw       xmm5,       [GLOBAL(rd)]        ; += round value
  1.1319 +        psraw       xmm4,       VP8_FILTER_SHIFT    ; /= 128
  1.1320 +
  1.1321 +        paddw       xmm6,       [GLOBAL(rd)]        ; += round value
  1.1322 +        psraw       xmm5,       VP8_FILTER_SHIFT    ; /= 128
  1.1323 +
  1.1324 +        psraw       xmm6,       VP8_FILTER_SHIFT    ; /= 128
  1.1325 +        packuswb    xmm1,       xmm1                ; output row 0 as bytes
  1.1326 +
  1.1327 +        packuswb    xmm2,       xmm2                ; output row 1 as bytes
  1.1328 +        movq        [rdi],      xmm1                ; store output row 0
  1.1329 +
  1.1330 +        packuswb    xmm3,       xmm3                ; output row 2 as bytes
  1.1331 +        movq        [rdi+rdx],  xmm2                ; store output row 1
  1.1332 +
  1.1333 +        packuswb    xmm4,       xmm4                ; output row 3 as bytes
  1.1334 +        movq        xmm1,       XMMWORD PTR [rsp+112]   ; row 7 (xmm1 is free after its store)
  1.1335 +
  1.1336 +        lea         rdi,        [rdi + 2*rdx]       ; advance the destination two rows
  1.1337 +        movq        xmm2,       XMMWORD PTR [rsp+128]   ; row 8 (xmm2 is free after its store)
  1.1338 +
  1.1339 +        packuswb    xmm5,       xmm5                ; output row 4 as bytes
  1.1340 +        movq        [rdi],      xmm3                ; store output row 2
  1.1341 +
  1.1342 +        packuswb    xmm6,       xmm6                ; output row 5 as bytes
  1.1343 +        movq        [rdi+rdx],  xmm4                ; store output row 3
  1.1344 +
  1.1345 +        lea         rdi,        [rdi + 2*rdx]       ; advance the destination two rows
  1.1346 +        punpcklbw   xmm7,       xmm1                ; rows 6/7 interleaved
  1.1347 +
  1.1348 +        movq        [rdi],      xmm5                ; store output row 4
  1.1349 +        pmaddubsw   xmm7,       xmm0                ; output row 6 (words)
  1.1350 +
  1.1351 +        movq        [rdi+rdx],  xmm6                ; store output row 5
  1.1352 +        punpcklbw   xmm1,       xmm2                ; rows 7/8 interleaved
  1.1353 +
  1.1354 +        pmaddubsw   xmm1,       xmm0                ; output row 7 (words)
  1.1355 +        paddw       xmm7,       [GLOBAL(rd)]        ; += round value
  1.1356 +
  1.1357 +        psraw       xmm7,       VP8_FILTER_SHIFT    ; /= 128
  1.1358 +        paddw       xmm1,       [GLOBAL(rd)]        ; += round value
  1.1359 +
  1.1360 +        psraw       xmm1,       VP8_FILTER_SHIFT    ; /= 128
  1.1361 +        packuswb    xmm7,       xmm7                ; output row 6 as bytes
  1.1362 +
  1.1363 +        packuswb    xmm1,       xmm1                ; output row 7 as bytes
  1.1364 +        lea         rdi,        [rdi + 2*rdx]       ; advance the destination two rows
  1.1365 +
  1.1366 +        movq        [rdi],      xmm7                ; store output row 6
  1.1367 +
  1.1368 +        movq        [rdi+rdx],  xmm1                ; store output row 7
  1.1369 +        lea         rsp,        [rsp + 144]         ; drop the 144-byte row buffer in one step
  1.1370 +
  1.1371 +        jmp         .done8x8
  1.1372 +
  1.1373 +.b8x8_fp_only:                                      ; yoffset == 0: horizontal pass only; rdi/rdx (dst_pitch)/xmm0 (HFilter) were set before the branch
  1.1374 +        lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8*dst_pitch (loop end marker)
  1.1375 +
  1.1376 +.next_row_fp:                                       ; four output rows per iteration
  1.1377 +        movdqa      xmm1,       XMMWORD PTR [rsp]       ; row n
  1.1378 +        movdqa      xmm3,       XMMWORD PTR [rsp+16]    ; row n+1
  1.1379 +
  1.1380 +        movdqa      xmm2,       xmm1                ; copy of row n
  1.1381 +        movdqa      xmm5,       XMMWORD PTR [rsp+32]    ; row n+2
  1.1382 +
  1.1383 +        psrldq      xmm2,       1                   ; row n shifted down one byte
  1.1384 +        movdqa      xmm7,       XMMWORD PTR [rsp+48]    ; row n+3
  1.1385 +
  1.1386 +        movdqa      xmm4,       xmm3                ; copy of row n+1
  1.1387 +        psrldq      xmm4,       1                   ; row n+1 shifted down one byte
  1.1388 +
  1.1389 +        movdqa      xmm6,       xmm5                ; copy of row n+2
  1.1390 +        psrldq      xmm6,       1                   ; row n+2 shifted down one byte
  1.1391 +
  1.1392 +        punpcklbw   xmm1,       xmm2                ; row n: 00 01 01 02 ... 07 08
  1.1393 +        pmaddubsw   xmm1,       xmm0                ; output row n (words)
  1.1394 +
  1.1395 +        punpcklbw   xmm3,       xmm4                ; row n+1: 00 01 01 02 ... 07 08
  1.1396 +        pmaddubsw   xmm3,       xmm0                ; output row n+1 (words)
  1.1397 +
  1.1398 +        punpcklbw   xmm5,       xmm6                ; row n+2: 00 01 01 02 ... 07 08
  1.1399 +        pmaddubsw   xmm5,       xmm0                ; output row n+2 (words)
  1.1400 +
  1.1401 +        movdqa      xmm2,       xmm7                ; copy of row n+3 (xmm2 is free again)
  1.1402 +        psrldq      xmm2,       1                   ; row n+3 shifted down one byte
  1.1403 +
  1.1404 +        punpcklbw   xmm7,       xmm2                ; row n+3: 00 01 01 02 ... 07 08
  1.1405 +        pmaddubsw   xmm7,       xmm0                ; output row n+3 (words)
  1.1406 +
  1.1407 +        paddw       xmm1,       [GLOBAL(rd)]        ; += round value
  1.1408 +        psraw       xmm1,       VP8_FILTER_SHIFT    ; /= 128
  1.1409 +
  1.1410 +        paddw       xmm3,       [GLOBAL(rd)]        ; += round value
  1.1411 +        psraw       xmm3,       VP8_FILTER_SHIFT    ; /= 128
  1.1412 +
  1.1413 +        paddw       xmm5,       [GLOBAL(rd)]        ; += round value
  1.1414 +        psraw       xmm5,       VP8_FILTER_SHIFT    ; /= 128
  1.1415 +
  1.1416 +        paddw       xmm7,       [GLOBAL(rd)]        ; += round value
  1.1417 +        psraw       xmm7,       VP8_FILTER_SHIFT    ; /= 128
  1.1418 +
  1.1419 +        packuswb    xmm1,       xmm1                ; output row n as bytes
  1.1420 +        packuswb    xmm3,       xmm3                ; output row n+1 as bytes
  1.1421 +
  1.1422 +        packuswb    xmm5,       xmm5                ; output row n+2 as bytes
  1.1423 +        movq        [rdi],      xmm1                ; store output row n
  1.1424 +
  1.1425 +        packuswb    xmm7,       xmm7                ; output row n+3 as bytes
  1.1426 +        movq        [rdi+rdx],  xmm3                ; store output row n+1
  1.1427 +
  1.1428 +        lea         rdi,        [rdi + 2*rdx]       ; advance the destination two rows
  1.1429 +        movq        [rdi],      xmm5                ; store output row n+2
  1.1430 +
  1.1431 +        lea         rsp,        [rsp + 4*16]        ; advance the buffer cursor four rows
  1.1432 +        movq        [rdi+rdx],  xmm7                ; store output row n+3
  1.1433 +
  1.1434 +        lea         rdi,        [rdi + 2*rdx]       ; advance the destination two rows
  1.1435 +        cmp         rdi,        rcx                 ; reached dst_ptr + 8*dst_pitch?
  1.1436 +
  1.1437 +        jne         .next_row_fp
  1.1438 +
  1.1439 +        lea         rsp,        [rsp + 16]          ; step over the ninth buffered row, which this path never reads
  1.1440 +
  1.1441 +.done8x8:                                           ; every path arrives here with rsp advanced by exactly 144 bytes
  1.1442 +    ;add rsp, 144                                    (left disabled: the paths above have already moved rsp past the buffer)
  1.1443 +    pop         rsp                                 ; NOTE(review): presumably restores the rsp saved by ALIGN_STACK - confirm
  1.1444 +    ; begin epilog
  1.1445 +    pop         rdi
  1.1446 +    pop         rsi
  1.1447 +    RESTORE_GOT
  1.1448 +    RESTORE_XMM
  1.1449 +    UNSHADOW_ARGS
  1.1450 +    pop         rbp
  1.1451 +    ret
  1.1452 +
  1.1453 +SECTION_RODATA
  1.1454 +align 16
  1.1455 +shuf1b:              ; NOTE(review): 16-entry byte-index tables; presumably pshufb masks for the six-tap filters earlier in the file - confirm
  1.1456 +    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
  1.1457 +shuf2b:
  1.1458 +    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
  1.1459 +shuf3b:
  1.1460 +    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
  1.1461 +
  1.1462 +align 16
  1.1463 +shuf2bfrom1:         ; NOTE(review): name suggests indices relative to data already permuted by shuf1b - confirm at the use site
  1.1464 +    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
  1.1465 +align 16
  1.1466 +shuf3bfrom1:
  1.1467 +    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
  1.1468 +
  1.1469 +align 16
  1.1470 +rd:                  ; rounding term: eight words of 0x40 (64 = VP8_FILTER_WEIGHT / 2), added before the shift by VP8_FILTER_SHIFT
  1.1471 +    times 8 dw 0x40
  1.1472 +
  1.1473 +align 16
  1.1474 +k0_k5:               ; tap-pair table: eight 16-byte rows (128 bytes), each row one byte pair repeated 8 times
  1.1475 +    times 8 db 0, 0             ;placeholder
  1.1476 +    times 8 db 0, 0
  1.1477 +    times 8 db 2, 1
  1.1478 +    times 8 db 0, 0
  1.1479 +    times 8 db 3, 3
  1.1480 +    times 8 db 0, 0
  1.1481 +    times 8 db 1, 2
  1.1482 +    times 8 db 0, 0
  1.1483 +k1_k3:               ; follows k0_k5 with no padding, i.e. at k0_k5 + 128; same row layout
  1.1484 +    times 8 db  0,    0         ;placeholder
  1.1485 +    times 8 db  -6,  12
  1.1486 +    times 8 db -11,  36
  1.1487 +    times 8 db  -9,  50
  1.1488 +    times 8 db -16,  77
  1.1489 +    times 8 db  -6,  93
  1.1490 +    times 8 db  -8, 108
  1.1491 +    times 8 db  -1, 123
  1.1492 +k2_k4:               ; follows k1_k3 with no padding, i.e. at k0_k5 + 256; same row layout
  1.1493 +    times 8 db 128,    0        ;placeholder
  1.1494 +    times 8 db 123,   -1
  1.1495 +    times 8 db 108,   -8
  1.1496 +    times 8 db  93,   -6
  1.1497 +    times 8 db  77,  -16
  1.1498 +    times 8 db  50,   -9
  1.1499 +    times 8 db  36,  -11
  1.1500 +    times 8 db  12,   -6
  1.1501 +align 16
  1.1502 +vp8_bilinear_filters_ssse3:      ; 2-tap pairs (128 - 16*i, 16*i) for i = 0..7; one 16-byte row per offset, addressed as base + offset*16
  1.1503 +    times 8 db 128, 0            ; offset 0; the byte 128 is 0x80, which pmaddubsw interprets as -128 when used as a tap
  1.1504 +    times 8 db 112, 16           ; offset 1
  1.1505 +    times 8 db 96,  32           ; offset 2
  1.1506 +    times 8 db 80,  48           ; offset 3
  1.1507 +    times 8 db 64,  64           ; offset 4
  1.1508 +    times 8 db 48,  80           ; offset 5
  1.1509 +    times 8 db 32,  96           ; offset 6
  1.1510 +    times 8 db 16,  112          ; offset 7
  1.1511 +

mercurial