--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libyuv/source/x86inc.asm Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;*          Anton Mitrofanov <BugMaster@narod.ru>
+;*          Jason Garrett-Glaser <darkshikari@gmail.com>
+;*          Henrik Gramner <hengar-6@student.ltu.se>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+%define WIN64  0
+%define UNIX64 0
+%if ARCH_X86_64
+    %ifidn __OUTPUT_FORMAT__,win32
+        %define WIN64  1
+    %elifidn __OUTPUT_FORMAT__,win64
+        %define WIN64  1
+    %else
+        %define UNIX64 1
+    %endif
+%endif
+
+%ifdef PREFIX
+    %define mangle(x) _ %+ x
+%else
+    %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0-1 16
+    %ifidn __OUTPUT_FORMAT__,macho64
+        SECTION .text align=%1
+    %elifidn __OUTPUT_FORMAT__,macho
+        SECTION .text align=%1
+        fakegot:
+    %elifidn __OUTPUT_FORMAT__,aout
+        section .text
+    %else
+        SECTION .rodata align=%1
+    %endif
+%endmacro
+
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+    %ifidn __OUTPUT_FORMAT__,aout
+        SECTION .text
+    %else
+        SECTION .text align=%1
+    %endif
+%endmacro
+
+%if WIN64
+    %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+    %undef PIC
+%endif
+%ifdef PIC
+    default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need a more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
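+
+; Illustrative usage sketch (example only, not part of the upstream file):
+; a hypothetical SSE2 leaf function. With "INIT_XMM SSE2" active, cglobal
+; appends the cpu suffix, so the emitted symbol is CopyRow_SSE2; the function
+; and argument names here are invented purely for illustration.
+;
+;   INIT_XMM SSE2
+;   cglobal CopyRow, 3, 3, 1, dst, src, count
+;   .loop:
+;       movu    m0, [srcq]          ; movu/mova expand to movdqu/movdqa here
+;       movu    [dstq], m0
+;       lea     srcq, [srcq + 16]
+;       lea     dstq, [dstq + 16]
+;       sub     countd, 16
+;       jg      .loop
+;       RET                         ; pops anything PROLOGUE pushed, then returns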
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+    %define r%1q %2
+    %define r%1d %2d
+    %define r%1w %2w
+    %define r%1b %2b
+    %define r%1h %2h
+    %if %0 == 2
+        %define r%1m  %2d
+        %define r%1mp %2
+    %elif ARCH_X86_64 ; memory
+        %define r%1m [rsp + stack_offset + %3]
+        %define r%1mp qword r %+ %1m
+    %else
+        %define r%1m [esp + stack_offset + %3]
+        %define r%1mp dword r %+ %1m
+    %endif
+    %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+    %define r%1q r%1
+    %define e%1q r%1
+    %define r%1d e%1
+    %define e%1d e%1
+    %define r%1w %1
+    %define e%1w %1
+    %define r%1h %3
+    %define e%1h %3
+    %define r%1b %2
+    %define e%1b %2
+%if ARCH_X86_64 == 0
+    %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+    %assign %%i 0
+    %rep %0
+        CAT_XDEFINE t, %%i, r%1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+    %rep %0
+        %define t%1q t%1 %+ q
+        %define t%1d t%1 %+ d
+        %define t%1w t%1 %+ w
+        %define t%1h t%1 %+ h
+        %define t%1b t%1 %+ b
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+    %define gprsize 8
+%else
+    %define gprsize 4
+%endif
+
+%macro PUSH 1
+    push %1
+    %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+    pop %1
+    %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            PUSH r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+    %rep %0
+        %if %1 < regs_used
+            pop r%1
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+    %rep %0
+        %if %1 < num_args
+            mov r%1, r %+ %1 %+ mp
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+%macro SUB 2
+    sub %1, %2
+    %ifidn %1, rsp
+        %assign stack_offset stack_offset+(%2)
+    %endif
+%endmacro
+
+%macro ADD 2
+    add %1, %2
+    %ifidn %1, rsp
+        %assign stack_offset stack_offset-(%2)
+    %endif
+%endmacro
+
+%macro movifnidn 2
+    %ifnidn %1, %2
+        mov %1, %2
+    %endif
+%endmacro
+
+%macro movsxdifnidn 2
+    %ifnidn %1, %2
+        movsxd %1, %2
+    %endif
+%endmacro
+
+%macro ASSERT 1
+    %if (%1) == 0
+        %error assert failed
+    %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+    %ifdef n_arg_names
+        %assign %%i 0
+        %rep n_arg_names
+            CAT_UNDEF arg_name %+ %%i, q
+            CAT_UNDEF arg_name %+ %%i, d
+            CAT_UNDEF arg_name %+ %%i, w
+            CAT_UNDEF arg_name %+ %%i, h
+            CAT_UNDEF arg_name %+ %%i, b
+            CAT_UNDEF arg_name %+ %%i, m
+            CAT_UNDEF arg_name %+ %%i, mp
+            CAT_UNDEF arg_name, %%i
+            %assign %%i %%i+1
+        %endrep
+    %endif
+
+    %xdefine %%stack_offset stack_offset
+    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+    %assign %%i 0
+    %rep %0
+        %xdefine %1q r %+ %%i %+ q
+        %xdefine %1d r %+ %%i %+ d
+        %xdefine %1w r %+ %%i %+ w
+        %xdefine %1h r %+ %%i %+ h
+        %xdefine %1b r %+ %%i %+ b
+        %xdefine %1m r %+ %%i %+ m
+        %xdefine %1mp r %+ %%i %+ mp
+        CAT_XDEFINE arg_name, %%i, %1
+        %assign %%i %%i+1
+        %rotate 1
+    %endrep
+    %xdefine stack_offset %%stack_offset
+    %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+    %if mmsize == 8
+        %assign xmm_regs_used 0
+    %else
+        WIN64_SPILL_XMM %3
+    %endif
+    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+    %assign xmm_regs_used %1
+    ASSERT xmm_regs_used <= 16
+    %if xmm_regs_used > 6
+        SUB rsp, (xmm_regs_used-6)*16+16
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+        %endrep
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+    %if xmm_regs_used > 6
+        %assign %%i xmm_regs_used
+        %rep (xmm_regs_used-6)
+            %assign %%i %%i-1
+            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+        %endrep
+        add %1, (xmm_regs_used-6)*16+16
+    %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+    WIN64_RESTORE_XMM_INTERNAL %1
+    %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+    %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+%macro RET 0
+    WIN64_RESTORE_XMM_INTERNAL rsp
+    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    ASSERT regs_used >= num_args
+    ASSERT regs_used <= 15
+    PUSH_IF_USED 9, 10, 11, 12, 13, 14
+    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+    DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+%macro RET 0
+    POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+    %rep %0
+        %define r%1m [esp + stack_offset + 4*%1 + 4]
+        %define r%1mp dword r%1m
+        %rotate 1
+    %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+    %assign num_args %1
+    %assign regs_used %2
+    %if regs_used > 7
+        %assign regs_used 7
+    %endif
+    ASSERT regs_used >= num_args
+    PUSH_IF_USED 3, 4, 5, 6
+    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+    DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+%macro RET 0
+    POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+    vzeroupper
+%endif
+    ret
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+%macro REP_RET 0
+    %if has_epilogue
+        RET
+    %else
+        rep ret
+    %endif
+%endmacro
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+    %if has_epilogue
+        call %1
+        RET
+    %elif %2
+        jmp %1
+    %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+    cglobal_internal %1 %+ SUFFIX
+%else
+    cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+    %ifndef cglobaled_%1
+        %xdefine %1 mangle(%1)
+        %xdefine %1.skip_prologue %1 %+ .skip_prologue
+        CAT_XDEFINE cglobaled_, %1, 1
+    %endif
+    %xdefine current_function %1
+    %ifidn __OUTPUT_FORMAT__,elf
+        global %1:function hidden
+    %else
+        global %1
+    %endif
+    align function_align
+    %1:
+    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+    %assign stack_offset 0
+    %if %0 > 1
+        PROLOGUE %2
+    %endif
+%endmacro
+
+%macro cextern 1
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+    %xdefine %1 mangle(%1)
+    CAT_XDEFINE cglobaled_, %1, 1
+    extern %1
+%endmacro
+
+%macro const 2+
+    %xdefine %1 mangle(%1)
+    global %1
+    %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX      (1<<0)
+%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop      (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3     (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32  (1<<16)
+%assign cpuflags_cache64  (1<<17)
+%assign cpuflags_slowctz  (1<<18)
+%assign cpuflags_lzcnt    (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom     (1<<22)
+%assign cpuflags_bmi1     (1<<23)
+%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
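+; (Illustrative example only, not upstream text: typical call sites look like
+;    INIT_XMM SSE2    ; SSE2 build of everything that follows, _SSE2 suffix
+;    INIT_YMM AVX2    ; AVX2 build, ymm registers, _AVX2 suffix
+;  i.e. the cpuflags normally arrive through the INIT_* macros defined below.)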
+%macro INIT_CPUFLAGS 0-2
+    %if %0 >= 1
+        %xdefine cpuname %1
+        %assign cpuflags cpuflags_%1
+        %if %0 >= 2
+            %xdefine cpuname %1_%2
+            %assign cpuflags cpuflags | cpuflags_%2
+        %endif
+        %xdefine SUFFIX _ %+ cpuname
+        %if cpuflag(AVX)
+            %assign AVX_enabled 1
+        %endif
+        %if mmsize == 16 && notcpuflag(SSE2)
+            %define mova movaps
+            %define movu movups
+            %define movnta movntps
+        %endif
+        %if cpuflag(aligned)
+            %define movu mova
+        %elifidn %1, SSE3
+            %define movu lddqu
+        %endif
+    %else
+        %xdefine SUFFIX
+        %undef cpuname
+        %undef cpuflags
+    %endif
+%endmacro
+
+; merge MMX and SSE*
+
+%macro CAT_XDEFINE 3
+    %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+    %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+    %assign AVX_enabled 0
+    %define RESET_MM_PERMUTATION INIT_MMX %1
+    %define mmsize 8
+    %define num_mmregs 8
+    %define mova movq
+    %define movu movq
+    %define movh movd
+    %define movnta movntq
+    %assign %%i 0
+    %rep 8
+    CAT_XDEFINE m, %%i, mm %+ %%i
+    CAT_XDEFINE nmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    %rep 8
+    CAT_UNDEF m, %%i
+    CAT_UNDEF nmm, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+    %assign AVX_enabled 0
+    %define RESET_MM_PERMUTATION INIT_XMM %1
+    %define mmsize 16
+    %define num_mmregs 8
+    %if ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova movdqa
+    %define movu movdqu
+    %define movh movq
+    %define movnta movntdq
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, xmm %+ %%i
+    CAT_XDEFINE nxmm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+    %assign AVX_enabled 1
+    %define RESET_MM_PERMUTATION INIT_YMM %1
+    %define mmsize 32
+    %define num_mmregs 8
+    %if ARCH_X86_64
+    %define num_mmregs 16
+    %endif
+    %define mova vmovaps
+    %define movu vmovups
+    %undef movh
+    %define movnta vmovntps
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE m, %%i, ymm %+ %%i
+    CAT_XDEFINE nymm, %%i, %%i
+    %assign %%i %%i+1
+    %endrep
+    INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
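+;
+; A minimal illustrative example (not from the original source): after a macro
+; computes its result into m1, "SWAP 0, 1" renames the registers so the caller
+; can keep reading the result from m0; no data is actually moved:
+;   pmullw  m1, m2      ; result currently lives in m1
+;   SWAP    0, 1        ; from here on, "m0" refers to the old m1, at zero cost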
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+    %xdefine tmp%2 m%2
+    %xdefine ntmp%2 nm%2
+    %rotate 2
+%endrep
+%rep %0/2
+    %xdefine m%1 tmp%2
+    %xdefine nm%1 ntmp%2
+    %undef tmp%2
+    %undef ntmp%2
+    %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+    %xdefine tmp m%1
+    %xdefine m%1 m%2
+    %xdefine m%2 tmp
+    CAT_XDEFINE n, m%1, %1
+    CAT_XDEFINE n, m%2, %2
+%else
+    ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+    ; Be careful using this mode in nested macros though, as in some cases there may be
+    ; other copies of m# that have already been dereferenced and don't get updated correctly.
+    %xdefine %%n1 n %+ %1
+    %xdefine %%n2 n %+ %2
+    %xdefine tmp m %+ %%n1
+    CAT_XDEFINE m, %%n1, m %+ %%n2
+    CAT_XDEFINE m, %%n2, tmp
+    CAT_XDEFINE n, m %+ %%n1, %%n1
+    CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+    %undef tmp
+    %rotate 1
+%endrep
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+    %if %0
+        %xdefine %%f %1_m
+    %else
+        %xdefine %%f current_function %+ _m
+    %endif
+    %assign %%i 0
+    %rep num_mmregs
+    CAT_XDEFINE %%f, %%i, m %+ %%i
+    %assign %%i %%i+1
+    %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+    %ifdef %1_m0
+        %assign %%i 0
+        %rep num_mmregs
+        CAT_XDEFINE m, %%i, %1_m %+ %%i
+        CAT_XDEFINE n, m %+ %%i, %%i
+        %assign %%i %%i+1
+        %endrep
+    %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+    call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+    %xdefine %%i %1
+    %ifndef cglobaled_%1
+        %ifdef cglobaled_%2
+            %xdefine %%i %2
+        %endif
+    %endif
+    call %%i
+    LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+    %ifnum %2
+        %if %2==128
+            sub %1, -128
+        %else
+            add %1, %2
+        %endif
+    %else
+        add %1, %2
+    %endif
+%endmacro
+
+%macro sub 2
+    %ifnum %2
+        %if %2==128
+            add %1, -128
+        %else
+            sub %1, %2
+        %endif
+    %else
+        sub %1, %2
+    %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+    %if i < 8
+        CAT_XDEFINE sizeofmm, i, 8
+    %endif
+    CAT_XDEFINE sizeofxmm, i, 16
+    CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+    %xdefine %%opcode %1
+    %xdefine %%dst %2
+    %rep %0-2
+        %ifidn %%dst, %3
+            %error non-AVX emulation of ``%%opcode'' is not supported
+        %endif
+        %rotate 1
+    %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+    %ifid %6
+        %define %%sizeofreg sizeof%6
+    %elifid %5
+        %define %%sizeofreg sizeof%5
+    %else
+        %define %%sizeofreg mmsize
+    %endif
+    %if %%sizeofreg==32
+        %if %4>=3
+            v%1 %5, %6, %7
+        %else
+            v%1 %5, %6
+        %endif
+    %else
+        %if %%sizeofreg==8
+            %define %%regmov movq
+        %elif %2
+            %define %%regmov movaps
+        %else
+            %define %%regmov movdqa
+        %endif
+
+        %if %4>=3+%3
+            %ifnidn %5, %6
+                %if AVX_enabled && %%sizeofreg==16
+                    v%1 %5, %6, %7
+                %else
+                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+                    %%regmov %5, %6
+                    %1 %5, %7
+                %endif
+            %else
+                %1 %5, %7
+            %endif
+        %elif %4>=3
+            %1 %5, %6, %7
+        %else
+            %1 %5, %6
+        %endif
+    %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+    %assign %%swap 0
+    %if AVX_enabled
+        %ifnid %6
+            %assign %%swap 1
+        %endif
+    %elifnidn %5, %6
+        %ifnid %7
+            %assign %%swap 1
+        %endif
+    %endif
+    %if %%swap && %3 == 0 && %8 == 1
+        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+    %else
+        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+    %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+        %ifidn %3, fnord
+            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+        %elifidn %4, fnord
+            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+        %elifidn %5, fnord
+            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+        %else
+            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+        %endif
+    %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+    %if j < 10
+        CAT_XDEFINE q000, j, i
+    %elif j < 100
+        CAT_XDEFINE q00, j, i
+    %elif j < 1000
+        CAT_XDEFINE q0, j, i
+    %else
+        CAT_XDEFINE q, j, i
+    %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+    %macro %1 4-7 %1, %2, %3
+        %if cpuflag(xop)
+            v%5 %1, %2, %3, %4
+        %else
+            %6 %1, %2, %3
+            %7 %1, %4
+        %endif
+    %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf
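+
+; Illustrative note (example only, not part of the upstream file): with the
+; AVX_INSTR wrappers above, a single three-operand statement such as
+;   paddw m0, m1, m2
+; assembles directly to "vpaddw xmm0, xmm1, xmm2" when AVX is enabled, and is
+; emulated as "movdqa xmm0, xmm1" followed by "paddw xmm0, xmm2" otherwise.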