;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.
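;
;The adds in APPLY_FILTER_4/APPLY_FILTER_8 are saturating (paddsw), so the
;order matters: the products of the two large centre taps can push a partial
;sum past +32767 if they are accumulated before the negative outer taps, and
;everything added after a clamp comes out too small.  Summing the outer taps
;first and tap3/tap4 last keeps the intermediate sums within int16 range; if
;the final sum still clamps, packuswb would have clipped the pixel to 255
;anyway.
;
;The constant 0x0400040 loaded into krd is two 16-bit words of 64, i.e. half
;of the 1 << 7 filter scale, so "paddsw xmm0, krd" followed by "psraw xmm0, 7"
;rounds the Q7 filter sum to the nearest integer before packing to bytes.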

%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x0400040

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3
    punpcklqdq  xmm5, xmm4
    punpcklqdq  xmm6, xmm7

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm2
    movdqa      k5k4, xmm5
    movdqa      k6k7, xmm6

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;two row in one register
    punpckldq   xmm6, xmm7
    punpckldq   xmm2, xmm3
    punpckldq   xmm5, xmm4

    punpcklbw   xmm0, zero                  ;unpack to word
    punpcklbw   xmm6, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero

    pmullw      xmm0, k0k1                  ;multiply the filter factors
    pmullw      xmm6, k6k7
    pmullw      xmm2, k2k3
    pmullw      xmm5, k5k4

    paddsw      xmm0, xmm6                  ;sum
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm2
    psrldq      xmm2, 8
    paddsw      xmm0, xmm5
    psrldq      xmm5, 8
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm

%macro GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm0
    punpcklwd   xmm1, xmm1
    punpcklwd   xmm2, xmm2
    punpcklwd   xmm3, xmm3
    punpckhwd   xmm4, xmm4
    punpckhwd   xmm5, xmm5
    punpckhwd   xmm6, xmm6
    punpckhwd   xmm7, xmm7

    movdqa      k0, xmm0                    ;store filter factors on stack
    movdqa      k1, xmm1
    movdqa      k2, xmm2
    movdqa      k3, xmm3
    movdqa      k4, xmm4
    movdqa      k5, xmm5
    movdqa      k6, xmm6
    movdqa      k7, xmm7

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6                   ;rounding

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

%macro LOAD_VERT_8 1
    movq        xmm0, [rsi + %1]            ;0
    movq        xmm1, [rsi + rax + %1]      ;1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi, [rsi + rax]
    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    movq        xmm2, [rsi + rax + %1]      ;2
    movq        xmm3, [rsi + rax * 2 + %1]  ;3
    movq        xmm4, [rsi + rdx + %1]      ;4
    movq        xmm5, [rsi + rax * 4 + %1]  ;5
%endm

%macro APPLY_FILTER_8 2
    punpcklbw   xmm0, zero
    punpcklbw   xmm1, zero
    punpcklbw   xmm6, zero
    punpcklbw   xmm7, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero
    punpcklbw   xmm3, zero
    punpcklbw   xmm4, zero

    pmullw      xmm0, k0
    pmullw      xmm1, k1
    pmullw      xmm6, k6
    pmullw      xmm7, k7
    pmullw      xmm2, k2
    pmullw      xmm5, k5
    pmullw      xmm3, k3
    pmullw      xmm4, k4

    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm

;void vp9_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int src_pitch,
;    unsigned char *output_ptr,
;    unsigned int out_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
%define k0k1 [rsp + 16 * 0]
%define k2k3 [rsp + 16 * 1]
%define k5k4 [rsp + 16 * 2]
%define k6k7 [rsp + 16 * 3]
%define krd  [rsp + 16 * 4]
%define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi, [rsi + rax]
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int src_pitch,
;    unsigned char *output_ptr,
;    unsigned int out_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int src_pitch,
;    unsigned char *output_ptr,
;    unsigned int out_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub         rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 0, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
%define k0k1 [rsp + 16 * 0]
%define k2k3 [rsp + 16 * 1]
%define k5k4 [rsp + 16 * 2]
%define k6k7 [rsp + 16 * 3]
%define krd  [rsp + 16 * 4]
%define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi, [rsi + rax]
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5

    APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub         rsi, rax

    LOAD_VERT_8 8
    APPLY_FILTER_8 1, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d4_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int output_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
%define k0k1 [rsp + 16 * 0]
%define k2k3 [rsp + 16 * 1]
%define k5k4 [rsp + 16 * 2]
%define k6k7 [rsp + 16 * 3]
%define krd  [rsp + 16 * 4]
%define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d8_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int output_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int output_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

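    ; second half of the 16-wide row: [rsi + 5] is [rsi + 8 - 3], the same
    ; 3-pixel look-back applied to output pixels 8..15, which APPLY_FILTER_8
    ; then stores at [rdi + 8]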
    movdqu      xmm0, [rsi + 5]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
%define k0k1 [rsp + 16 * 0]
%define k2k3 [rsp + 16 * 1]
%define k5k4 [rsp + 16 * 2]
%define k6k7 [rsp + 16 * 3]
%define krd  [rsp + 16 * 4]
%define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 6
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
%define k0   [rsp + 16 * 0]
%define k1   [rsp + 16 * 1]
%define k2   [rsp + 16 * 2]
%define k3   [rsp + 16 * 3]
%define k4   [rsp + 16 * 4]
%define k5   [rsp + 16 * 5]
%define k6   [rsp + 16 * 6]
%define k7   [rsp + 16 * 7]
%define krd  [rsp + 16 * 8]
%define zero [rsp + 16 * 9]

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 0

    movdqu      xmm0, [rsi + 5]             ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 10
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret