;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
;
; Extracts the luma (Y) plane from packed YUY2 or UYVY pixels.
;   %1 = YUY2 or UYVY  (selects whether even or odd bytes carry Y)
;   %2 = a / u         (aligned vs. unaligned move suffix for mov%2)
;   %3 = optional name suffix (e.g. _Unaligned); empty for the base name
; Consumes 2*mmsize source bytes and produces mmsize Y bytes per iteration.
%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2               ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2               ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8                ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8             ; undo the per-lane interleave of packuswb
%endif
    sub        pixd, mmsize             ; sets flags for the jg below
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
;
; De-interleaves packed UV pairs into separate U and V planes.
;   %1 = a / u  (aligned vs. unaligned move suffix)
;   %2 = optional name suffix (e.g. _Unaligned); empty for the base name
%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4               ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq           ; dst_vq becomes offset from dst_u; one
                                        ; pointer increment serves both outputs

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8                ; odd bytes (V)
    psrlw      m3, m1, 8
    pand       m0, m0, m4               ; even bytes (U)
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8             ; undo the per-lane interleave of packuswb
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2    ; dst_v = dst_u + saved offset
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);
;
; Interleaves separate U and V planes into packed UV pairs.
;   %1 = a / u  (aligned vs. unaligned move suffix)
;   %2 = optional name suffix (e.g. _Unaligned); empty for the base name
%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq           ; src_vq becomes offset from src_u; one
                                        ; pointer increment serves both inputs

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    ; BUGFIX: was [src_vq], but after the sub above src_vq holds only the
    ; offset; src_v data lives at src_u + offset (same scheme as SplitUVRow).
    mov%1      m1, [src_uq + src_vq]
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1               ; first 8 UV pairs (';' comments: '//'
    punpckhbw  m0, m0, m1               ; next 8 UV pairs   is not valid NASM)
%if cpuflag(AVX2)
    ; cross-lane fixup: punpck works per 128-bit lane, so reassemble the
    ; low halves and high halves of both lanes into sequential output.
    vperm2i128 m1, m2, m0, 0x20         ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31         ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,