Thu, 22 Jan 2015 13:21:57 +0100
Incorporate changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
;
; Copyright 2012 The LibYuv Project Authors. All rights reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%ifdef __YASM_VERSION_ID__
%if __YASM_VERSION_ID__ < 01020000h
%error AVX2 is supported only by yasm 1.2.0 or later.
%endif
%endif
%include "x86inc.asm"

SECTION .text

; cglobal numeric constants are parameters, gpr regs, mm regs
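; For reference, x86inc.asm's cglobal takes the symbol name, the number of
; parameters, the number of general-purpose registers needed, the number of
; vector registers used, and then the parameter names; e.g. "3, 3, 3,
; src_yuy2, dst_y, pix" below requests three of each and exposes the
; arguments as src_yuy2q, dst_yq and pixd (q = pointer-sized, d = dword).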

; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
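; Roughly equivalent scalar C, for orientation only (not part of this file);
; assumes the usual packing YUY2 = Y0 U0 Y1 V0 ... and UYVY = U0 Y0 V0 Y1 ...,
; with pix counting Y samples:
;
;   void YUY2ToYRow(const uint8_t* src_yuy2, uint8_t* dst_y, int pix) {
;     for (int x = 0; x < pix; ++x)
;       dst_y[x] = src_yuy2[2 * x];        /* YUY2: even bytes are Y   */
;   }                                      /* UYVY: use src[2 * x + 1] */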

%macro YUY2TOYROW 2-3
cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
%ifidn %1,YUY2
    pcmpeqb    m2, m2, m2           ; generate mask 0x00ff00ff
    psrlw      m2, m2, 8
%endif

    ALIGN      4
.convertloop:
    mov%2      m0, [src_yuy2q]
    mov%2      m1, [src_yuy2q + mmsize]
    lea        src_yuy2q, [src_yuy2q + mmsize * 2]
%ifidn %1,YUY2
    pand       m0, m0, m2           ; YUY2 even bytes are Y
    pand       m1, m1, m2
%else
    psrlw      m0, m0, 8            ; UYVY odd bytes are Y
    psrlw      m1, m1, 8
%endif
    packuswb   m0, m0, m1
%if cpuflag(AVX2)
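    ; 256-bit packuswb packs each 128-bit lane separately, leaving the
    ; 8-byte groups ordered m0.lo, m1.lo, m0.hi, m1.hi; vpermq 0xd8
    ; (qword order 0,2,1,3) restores sequential order before the store.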
    vpermq     m0, m0, 0xd8
%endif
    sub        pixd, mmsize
    mov%2      [dst_yq], m0
    lea        dst_yq, [dst_yq + mmsize]
    jg         .convertloop
    REP_RET
%endmacro

; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
INIT_MMX MMX
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_XMM SSE2
YUY2TOYROW YUY2,a,
YUY2TOYROW YUY2,u,_Unaligned
YUY2TOYROW UYVY,a,
YUY2TOYROW UYVY,u,_Unaligned
INIT_YMM AVX2
YUY2TOYROW YUY2,a,
YUY2TOYROW UYVY,a,
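; Each INIT_* line selects the register width (mmsize = 8/16/32) and the cpu
; name that x86inc appends to the symbol, so the expansions above emit
; YUY2ToYRow_MMX, YUY2ToYRow_Unaligned_SSE2, YUY2ToYRow_AVX2, and so on; the
; a/u argument picks mova or movu (aligned vs. unaligned loads and stores).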

; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
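; Roughly equivalent scalar C, for orientation only (not part of this file):
; deinterleave a packed UV row into separate U and V planes.
;
;   void SplitUVRow(const uint8_t* src_uv, uint8_t* dst_u,
;                   uint8_t* dst_v, int pix) {
;     for (int x = 0; x < pix; ++x) {
;       dst_u[x] = src_uv[2 * x];          /* even bytes are U */
;       dst_v[x] = src_uv[2 * x + 1];      /* odd bytes are V  */
;     }
;   }
;
; The asm keeps dst_v as an offset from dst_u (sub dst_vq, dst_uq) so a
; single pointer increment advances both destinations.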

%macro SplitUVRow 1-2
cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
    pcmpeqb    m4, m4, m4           ; generate mask 0x00ff00ff
    psrlw      m4, m4, 8
    sub        dst_vq, dst_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uvq]
    mov%1      m1, [src_uvq + mmsize]
    lea        src_uvq, [src_uvq + mmsize * 2]
    psrlw      m2, m0, 8            ; odd bytes
    psrlw      m3, m1, 8
    pand       m0, m0, m4           ; even bytes
    pand       m1, m1, m4
    packuswb   m0, m0, m1
    packuswb   m2, m2, m3
%if cpuflag(AVX2)
    vpermq     m0, m0, 0xd8
    vpermq     m2, m2, 0xd8
%endif
    mov%1      [dst_uq], m0
    mov%1      [dst_uq + dst_vq], m2
    lea        dst_uq, [dst_uq + mmsize]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_XMM SSE2
SplitUVRow a,
SplitUVRow u,_Unaligned
INIT_YMM AVX2
SplitUVRow a,

; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
;                      int width);
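; Roughly equivalent scalar C, for orientation only (not part of this file):
; interleave separate U and V rows into UV byte pairs.
;
;   void MergeUVRow(const uint8_t* src_u, const uint8_t* src_v,
;                   uint8_t* dst_uv, int width) {
;     for (int x = 0; x < width; ++x) {
;       dst_uv[2 * x]     = src_u[x];
;       dst_uv[2 * x + 1] = src_v[x];
;     }
;   }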

%macro MergeUVRow_ 1-2
cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
    sub        src_vq, src_uq

    ALIGN      4
.convertloop:
    mov%1      m0, [src_uq]
    mov%1      m1, [src_uq + src_vq] ; src_vq holds src_v - src_u
    lea        src_uq, [src_uq + mmsize]
    punpcklbw  m2, m0, m1           ; first 8 UV pairs
    punpckhbw  m0, m0, m1           ; next 8 UV pairs
%if cpuflag(AVX2)
    vperm2i128 m1, m2, m0, 0x20     ; low 128 of ymm2 and low 128 of ymm0
    vperm2i128 m2, m2, m0, 0x31     ; high 128 of ymm2 and high 128 of ymm0
    mov%1      [dst_uvq], m1
    mov%1      [dst_uvq + mmsize], m2
%else
    mov%1      [dst_uvq], m2
    mov%1      [dst_uvq + mmsize], m0
%endif
    lea        dst_uvq, [dst_uvq + mmsize * 2]
    sub        pixd, mmsize
    jg         .convertloop
    REP_RET
%endmacro

INIT_MMX MMX
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_XMM SSE2
MergeUVRow_ a,
MergeUVRow_ u,_Unaligned
INIT_YMM AVX2
MergeUVRow_ a,