Thu, 22 Jan 2015 13:21:57 +0100
Incorporate changes requested in the Mozilla review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
;*****************************************************************************
;* x86inc.asm: x264asm abstraction layer
;*****************************************************************************
;* Copyright (C) 2005-2012 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Anton Mitrofanov <BugMaster@narod.ru>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Henrik Gramner <hengar-6@student.ltu.se>
;*
;* Permission to use, copy, modify, and/or distribute this software for any
;* purpose with or without fee is hereby granted, provided that the above
;* copyright notice and this permission notice appear in all copies.
;*
;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
;*****************************************************************************

; This is a header file for the x264ASM assembly language, which uses
; NASM/YASM syntax combined with a large number of macros to provide easy
; abstraction between different calling conventions (x86_32, win64, linux64).
; It also has various other useful features to simplify writing the kind of
; DSP functions that are most often used in x264.

; Unlike the rest of x264, this file is available under an ISC license, as it
; has significant usefulness outside of x264 and we want it to be available
; to the largest audience possible. Of course, if you modify it for your own
; purposes to add a new feature, we strongly encourage contributing a patch
; as this feature might be useful for others as well. Send patches or ideas
; to x264-devel@videolan.org .

; Local changes for libyuv:
; remove %define program_name and references in labels
; rename cpus to uppercase

%define WIN64  0
%define UNIX64 0
%if ARCH_X86_64
    %ifidn __OUTPUT_FORMAT__,win32
        %define WIN64  1
    %elifidn __OUTPUT_FORMAT__,win64
        %define WIN64  1
    %else
        %define UNIX64 1
    %endif
%endif

%ifdef PREFIX
    %define mangle(x) _ %+ x
%else
    %define mangle(x) x
%endif

; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align
; attribute, so use a different read-only section.
%macro SECTION_RODATA 0-1 16
    %ifidn __OUTPUT_FORMAT__,macho64
        SECTION .text align=%1
    %elifidn __OUTPUT_FORMAT__,macho
        SECTION .text align=%1
        fakegot:
    %elifidn __OUTPUT_FORMAT__,aout
        section .text
    %else
        SECTION .rodata align=%1
    %endif
%endmacro

; aout does not support align=
%macro SECTION_TEXT 0-1 16
    %ifidn __OUTPUT_FORMAT__,aout
        SECTION .text
    %else
        SECTION .text align=%1
    %endif
%endmacro

%if WIN64
    %define PIC
%elif ARCH_X86_64 == 0
    ; x86_32 doesn't require PIC.
    ; Some distros prefer shared objects to be PIC, but nothing breaks if
    ; the code contains a few textrels, so we'll skip that complexity.
    %undef PIC
%endif
%ifdef PIC
    default rel
%endif

; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
CPU amdnop

; Macros to eliminate most code duplication between x86_32 and x86_64:
; Currently this works only for leaf functions which load all their arguments
; into registers at the start, and make no other use of the stack. Luckily
; that covers most of x264's asm.

; PROLOGUE:
; %1 = number of arguments. loads them from stack if needed.
; %2 = number of registers used. pushes callee-saved regs if needed.
; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
; %4 = list of names to define to registers
; PROLOGUE can also be invoked by adding the same options to cglobal

; e.g.
; cglobal foo, 2,3,0, dst, src, tmp
; declares a function (foo), taking two args (dst and src) and one local
; variable (tmp)

; TODO Some functions can use some args directly from the stack. If they're
; the last args then you can just not declare them, but if they're in the
; middle we need a more flexible macro.

; RET:
; Pops anything that was pushed by PROLOGUE, and returns.

; REP_RET:
; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
; which are slow when a normal ret follows a branch.

; registers:
; rN and rNq are the native-size register holding function argument N
; rNd, rNw, rNb are dword, word, and byte size
; rNh is the high 8 bits of the word size
; rNm is the original location of arg N (a register or on the stack), dword
; rNmp is native size

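; For illustration only, a hypothetical leaf function built on these macros
; (the name and body are not part of this file; it assumes 16-byte-aligned
; buffers whose length in bytes is a multiple of mmsize):
;
;     cglobal copy_block, 3,3,1, dst, src, count
;     .loop:
;         mova     m0, [srcq]
;         mova     [dstq], m0
;         add      srcq, mmsize
;         add      dstq, mmsize
;         sub      countd, mmsize
;         jg       .loop
;     RET
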
%macro DECLARE_REG 2-3
    %define r%1q %2
    %define r%1d %2d
    %define r%1w %2w
    %define r%1b %2b
    %define r%1h %2h
    %if %0 == 2
        %define r%1m  %2d
        %define r%1mp %2
    %elif ARCH_X86_64 ; memory
        %define r%1m [rsp + stack_offset + %3]
        %define r%1mp qword r %+ %1m
    %else
        %define r%1m [esp + stack_offset + %3]
        %define r%1mp dword r %+ %1m
    %endif
    %define r%1 %2
%endmacro

%macro DECLARE_REG_SIZE 3
    %define r%1q r%1
    %define e%1q r%1
    %define r%1d e%1
    %define e%1d e%1
    %define r%1w %1
    %define e%1w %1
    %define r%1h %3
    %define e%1h %3
    %define r%1b %2
    %define e%1b %2
    %if ARCH_X86_64 == 0
        %define r%1 e%1
    %endif
%endmacro

DECLARE_REG_SIZE ax, al, ah
DECLARE_REG_SIZE bx, bl, bh
DECLARE_REG_SIZE cx, cl, ch
DECLARE_REG_SIZE dx, dl, dh
DECLARE_REG_SIZE si, sil, null
DECLARE_REG_SIZE di, dil, null
DECLARE_REG_SIZE bp, bpl, null

; t# defines for when per-arch register allocation is more complex than just
; function arguments

%macro DECLARE_REG_TMP 1-*
    %assign %%i 0
    %rep %0
        CAT_XDEFINE t, %%i, r%1
        %assign %%i %%i+1
        %rotate 1
    %endrep
%endmacro

%macro DECLARE_REG_TMP_SIZE 0-*
    %rep %0
        %define t%1q t%1 %+ q
        %define t%1d t%1 %+ d
        %define t%1w t%1 %+ w
        %define t%1h t%1 %+ h
        %define t%1b t%1 %+ b
        %rotate 1
    %endrep
%endmacro

DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
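
; For illustration only (a hypothetical mapping, not part of this file):
;     DECLARE_REG_TMP 2, 0, 1
; makes t0 an alias for r2, t1 for r0 and t2 for r1, so shared code can use
; stable t# names even when the best register assignment differs per arch;
; t0q/t0d/t0w/t0h/t0b then select the matching sub-register sizes.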

%if ARCH_X86_64
    %define gprsize 8
%else
    %define gprsize 4
%endif

%macro PUSH 1
    push %1
    %assign stack_offset stack_offset+gprsize
%endmacro

%macro POP 1
    pop %1
    %assign stack_offset stack_offset-gprsize
%endmacro

%macro PUSH_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            PUSH r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro POP_IF_USED 1-*
    %rep %0
        %if %1 < regs_used
            pop r%1
        %endif
        %rotate 1
    %endrep
%endmacro

%macro LOAD_IF_USED 1-*
    %rep %0
        %if %1 < num_args
            mov r%1, r %+ %1 %+ mp
        %endif
        %rotate 1
    %endrep
%endmacro

%macro SUB 2
    sub %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset+(%2)
    %endif
%endmacro

%macro ADD 2
    add %1, %2
    %ifidn %1, rsp
        %assign stack_offset stack_offset-(%2)
    %endif
%endmacro
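
; For illustration only: stack_offset tracks how far rsp has moved since
; function entry, so the r#m stack-argument defines stay valid across pushes
; and stack allocations. E.g. on win64 (hypothetical sequence):
;     PUSH r7          ; stack_offset += gprsize
;     SUB  rsp, 32     ; stack_offset += 32
;     mov  r0d, r4m    ; r4m = [rsp + stack_offset + 40] still reads arg 4
;     ADD  rsp, 32
;     POP  r7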

%macro movifnidn 2
    %ifnidn %1, %2
        mov %1, %2
    %endif
%endmacro

%macro movsxdifnidn 2
    %ifnidn %1, %2
        movsxd %1, %2
    %endif
%endmacro

%macro ASSERT 1
    %if (%1) == 0
        %error assert failed
    %endif
%endmacro

%macro DEFINE_ARGS 0-*
    %ifdef n_arg_names
        %assign %%i 0
        %rep n_arg_names
            CAT_UNDEF arg_name %+ %%i, q
            CAT_UNDEF arg_name %+ %%i, d
            CAT_UNDEF arg_name %+ %%i, w
            CAT_UNDEF arg_name %+ %%i, h
            CAT_UNDEF arg_name %+ %%i, b
            CAT_UNDEF arg_name %+ %%i, m
            CAT_UNDEF arg_name %+ %%i, mp
            CAT_UNDEF arg_name, %%i
            %assign %%i %%i+1
        %endrep
    %endif

    %xdefine %%stack_offset stack_offset
    %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
    %assign %%i 0
    %rep %0
        %xdefine %1q r %+ %%i %+ q
        %xdefine %1d r %+ %%i %+ d
        %xdefine %1w r %+ %%i %+ w
        %xdefine %1h r %+ %%i %+ h
        %xdefine %1b r %+ %%i %+ b
        %xdefine %1m r %+ %%i %+ m
        %xdefine %1mp r %+ %%i %+ mp
        CAT_XDEFINE arg_name, %%i, %1
        %assign %%i %%i+1
        %rotate 1
    %endrep
    %xdefine stack_offset %%stack_offset
    %assign n_arg_names %0
%endmacro
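
; For illustration only (hypothetical names): after
;     DEFINE_ARGS dst, src, len
; dstq/dstd/dstw/dsth/dstb alias r0's sizes, src* alias r1, len* alias r2,
; and dstm/dstmp refer to argument 0's original home (register or stack).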

%if WIN64 ; Windows x64 ;=================================================

DECLARE_REG 0,  rcx
DECLARE_REG 1,  rdx
DECLARE_REG 2,  R8
DECLARE_REG 3,  R9
DECLARE_REG 4,  R10, 40
DECLARE_REG 5,  R11, 48
DECLARE_REG 6,  rax, 56
DECLARE_REG 7,  rdi, 64
DECLARE_REG 8,  rsi, 72
DECLARE_REG 9,  rbx, 80
DECLARE_REG 10, rbp, 88
DECLARE_REG 11, R12, 96
DECLARE_REG 12, R13, 104
DECLARE_REG 13, R14, 112
DECLARE_REG 14, R15, 120

%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
    %if mmsize == 8
        %assign xmm_regs_used 0
    %else
        WIN64_SPILL_XMM %3
    %endif
    LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%macro WIN64_SPILL_XMM 1
    %assign xmm_regs_used %1
    ASSERT xmm_regs_used <= 16
    %if xmm_regs_used > 6
        SUB rsp, (xmm_regs_used-6)*16+16
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
        %endrep
    %endif
%endmacro
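
; The (~stack_offset&8) term above adds 8 bytes of padding exactly when
; stack_offset is a multiple of 16, compensating for rsp being 8 mod 16 at
; function entry (the return address), so every movdqa spill slot ends up
; 16-byte aligned no matter how many GPRs were pushed first.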

%macro WIN64_RESTORE_XMM_INTERNAL 1
    %if xmm_regs_used > 6
        %assign %%i xmm_regs_used
        %rep (xmm_regs_used-6)
            %assign %%i %%i-1
            movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
        %endrep
        add %1, (xmm_regs_used-6)*16+16
    %endif
%endmacro

%macro WIN64_RESTORE_XMM 1
    WIN64_RESTORE_XMM_INTERNAL %1
    ; undo the full adjustment made by WIN64_SPILL_XMM (note the parentheses:
    ; the entire spill size, including the 16-byte pad, is subtracted)
    %assign stack_offset stack_offset-((xmm_regs_used-6)*16+16)
    %assign xmm_regs_used 0
%endmacro

%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32

%macro RET 0
    WIN64_RESTORE_XMM_INTERNAL rsp
    POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%elif ARCH_X86_64 ; *nix x64 ;=============================================

DECLARE_REG 0,  rdi
DECLARE_REG 1,  rsi
DECLARE_REG 2,  rdx
DECLARE_REG 3,  rcx
DECLARE_REG 4,  R8
DECLARE_REG 5,  R9
DECLARE_REG 6,  rax, 8
DECLARE_REG 7,  R10, 16
DECLARE_REG 8,  R11, 24
DECLARE_REG 9,  rbx, 32
DECLARE_REG 10, rbp, 40
DECLARE_REG 11, R12, 48
DECLARE_REG 12, R13, 56
DECLARE_REG 13, R14, 64
DECLARE_REG 14, R15, 72

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    ASSERT regs_used >= num_args
    ASSERT regs_used <= 15
    PUSH_IF_USED 9, 10, 11, 12, 13, 14
    LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 9 || mmsize == 32

%macro RET 0
    POP_IF_USED 14, 13, 12, 11, 10, 9
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%else ; X86_32 ;==============================================================

DECLARE_REG 0, eax, 4
DECLARE_REG 1, ecx, 8
DECLARE_REG 2, edx, 12
DECLARE_REG 3, ebx, 16
DECLARE_REG 4, esi, 20
DECLARE_REG 5, edi, 24
DECLARE_REG 6, ebp, 28
%define rsp esp

%macro DECLARE_ARG 1-*
    %rep %0
        %define r%1m [esp + stack_offset + 4*%1 + 4]
        %define r%1mp dword r%1m
        %rotate 1
    %endrep
%endmacro

DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14

%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
    %assign num_args %1
    %assign regs_used %2
    %if regs_used > 7
        %assign regs_used 7
    %endif
    ASSERT regs_used >= num_args
    PUSH_IF_USED 3, 4, 5, 6
    LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
    DEFINE_ARGS %4
%endmacro

%define has_epilogue regs_used > 3 || mmsize == 32

%macro RET 0
    POP_IF_USED 6, 5, 4, 3
    %if mmsize == 32
        vzeroupper
    %endif
    ret
%endmacro

%endif ;======================================================================

%if WIN64 == 0
    %macro WIN64_SPILL_XMM 1
    %endmacro
    %macro WIN64_RESTORE_XMM 1
    %endmacro
%endif

%macro REP_RET 0
    %if has_epilogue
        RET
    %else
        rep ret
    %endif
%endmacro

%macro TAIL_CALL 2 ; callee, is_nonadjacent
    %if has_epilogue
        call %1
        RET
    %elif %2
        jmp %1
    %endif
%endmacro
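
; For illustration only (hypothetical callee):
;     TAIL_CALL some_common_tail, 1
; calls and then unwinds normally when an epilogue is needed, and otherwise
; jumps straight into the callee; with %2 == 0 it emits nothing, for the
; case where the callee is assembled immediately after and is reached by
; fall-through.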

;=============================================================================
; arch-independent part
;=============================================================================

%assign function_align 16

; Begin a function.
; Applies any symbol mangling needed for C linkage, and sets up a define such
; that subsequent uses of the function name automatically refer to the
; mangled version.
; Appends cpuflags to the function name if cpuflags has been specified.
%macro cglobal 1-2+ ; name, [PROLOGUE args]
    %if %0 == 1
        cglobal_internal %1 %+ SUFFIX
    %else
        cglobal_internal %1 %+ SUFFIX, %2
    %endif
%endmacro
%macro cglobal_internal 1-2+
    %ifndef cglobaled_%1
        %xdefine %1 mangle(%1)
        %xdefine %1.skip_prologue %1 %+ .skip_prologue
        CAT_XDEFINE cglobaled_, %1, 1
    %endif
    %xdefine current_function %1
    %ifidn __OUTPUT_FORMAT__,elf
        global %1:function hidden
    %else
        global %1
    %endif
    align function_align
    %1:
    RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
    %assign stack_offset 0
    %if %0 > 1
        PROLOGUE %2
    %endif
%endmacro
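
; For illustration only (hypothetical name): after INIT_XMM SSE2,
;     cglobal scale_row, 2,2
; emits the label scale_row_SSE2 (underscore-prefixed if PREFIX is set),
; declares it global (hidden on ELF), aligns it to function_align, resets
; stack_offset and runs PROLOGUE 2,2.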

%macro cextern 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

; like cextern, but without the prefix
; (with program_name removed for libyuv, this is currently identical to
; cextern)
%macro cextern_naked 1
    %xdefine %1 mangle(%1)
    CAT_XDEFINE cglobaled_, %1, 1
    extern %1
%endmacro

%macro const 2+
    %xdefine %1 mangle(%1)
    global %1
    %1: %2
%endmacro

; This is needed for ELF, otherwise the GNU linker assumes the stack is
; executable by default.
%ifidn __OUTPUT_FORMAT__,elf
    SECTION .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf32
    section .note.GNU-stack noalloc noexec nowrite progbits
%endif
%ifidn __OUTPUT_FORMAT__,elf64
    section .note.GNU-stack noalloc noexec nowrite progbits
%endif

; cpuflags

%assign cpuflags_MMX      (1<<0)
%assign cpuflags_MMX2     (1<<1) | cpuflags_MMX
%assign cpuflags_3dnow    (1<<2) | cpuflags_MMX
%assign cpuflags_3dnow2   (1<<3) | cpuflags_3dnow
%assign cpuflags_SSE      (1<<4) | cpuflags_MMX2
%assign cpuflags_SSE2     (1<<5) | cpuflags_SSE
%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
%assign cpuflags_SSE3     (1<<7) | cpuflags_SSE2
%assign cpuflags_SSSE3    (1<<8) | cpuflags_SSE3
%assign cpuflags_SSE4     (1<<9) | cpuflags_SSSE3
%assign cpuflags_SSE42    (1<<10)| cpuflags_SSE4
%assign cpuflags_AVX      (1<<11)| cpuflags_SSE42
%assign cpuflags_xop      (1<<12)| cpuflags_AVX
%assign cpuflags_fma4     (1<<13)| cpuflags_AVX
%assign cpuflags_AVX2     (1<<14)| cpuflags_AVX
%assign cpuflags_fma3     (1<<15)| cpuflags_AVX

%assign cpuflags_cache32  (1<<16)
%assign cpuflags_cache64  (1<<17)
%assign cpuflags_slowctz  (1<<18)
%assign cpuflags_lzcnt    (1<<19)
%assign cpuflags_misalign (1<<20)
%assign cpuflags_aligned  (1<<21) ; not a cpu feature, but a function variant
%assign cpuflags_atom     (1<<22)
%assign cpuflags_bmi1     (1<<23)
%assign cpuflags_bmi2     (1<<24)|cpuflags_bmi1
%assign cpuflags_tbm      (1<<25)|cpuflags_bmi1

%define cpuflag(x)    ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
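
; For illustration only: each flag implies its prerequisites, so
; cpuflag(SSE2) is also true while building e.g. an SSSE3 variant:
;     %if cpuflag(SSSE3)
;         pshufb m0, m1        ; hypothetical fast path
;     %else
;         ...                  ; plain-SSE2 fallback
;     %endif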

; Takes up to 2 cpuflags from the above list.
; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the
; specified cpu.
; You shouldn't need to invoke this macro directly, it's a subroutine for
; INIT_MMX & co.
%macro INIT_CPUFLAGS 0-2
    %if %0 >= 1
        %xdefine cpuname %1
        %assign cpuflags cpuflags_%1
        %if %0 >= 2
            %xdefine cpuname %1_%2
            %assign cpuflags cpuflags | cpuflags_%2
        %endif
        %xdefine SUFFIX _ %+ cpuname
        %if cpuflag(AVX)
            %assign AVX_enabled 1
        %endif
        %if mmsize == 16 && notcpuflag(SSE2)
            %define mova movaps
            %define movu movups
            %define movnta movntps
        %endif
        %if cpuflag(aligned)
            %define movu mova
        %elifidn %1, SSE3
            %define movu lddqu
        %endif
    %else
        %xdefine SUFFIX
        %undef cpuname
        %undef cpuflags
    %endif
%endmacro

; merge MMX and SSE*

%macro CAT_XDEFINE 3
    %xdefine %1%2 %3
%endmacro

%macro CAT_UNDEF 2
    %undef %1%2
%endmacro

%macro INIT_MMX 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_MMX %1
    %define mmsize 8
    %define num_mmregs 8
    %define mova movq
    %define movu movq
    %define movh movd
    %define movnta movntq
    %assign %%i 0
    %rep 8
        CAT_XDEFINE m, %%i, mm %+ %%i
        CAT_XDEFINE nmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    %rep 8
        CAT_UNDEF m, %%i
        CAT_UNDEF nmm, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_XMM 0-1+
    %assign AVX_enabled 0
    %define RESET_MM_PERMUTATION INIT_XMM %1
    %define mmsize 16
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova movdqa
    %define movu movdqu
    %define movh movq
    %define movnta movntdq
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, xmm %+ %%i
        CAT_XDEFINE nxmm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

%macro INIT_YMM 0-1+
    %assign AVX_enabled 1
    %define RESET_MM_PERMUTATION INIT_YMM %1
    %define mmsize 32
    %define num_mmregs 8
    %if ARCH_X86_64
        %define num_mmregs 16
    %endif
    %define mova vmovaps
    %define movu vmovups
    %undef movh
    %define movnta vmovntps
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE m, %%i, ymm %+ %%i
        CAT_XDEFINE nymm, %%i, %%i
        %assign %%i %%i+1
    %endrep
    INIT_CPUFLAGS %1
%endmacro

INIT_XMM
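
; For illustration only (hypothetical function name):
;     INIT_XMM SSE2
;     cglobal foo, 2,2   ; assembles foo_SSE2; m# map to xmm#, mova=movdqa
;     INIT_YMM AVX2
;     cglobal foo, 2,2   ; assembles foo_AVX2; m# map to ymm#, mmsize=32
; Each INIT_* also resets the m# permutation and rederives cpuflags/SUFFIX.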

; I often want to use macros that permute their arguments. e.g. there's no
; efficient way to implement butterfly or transpose or dct without swapping
; some arguments.
;
; I would like to not have to manually keep track of the permutations:
; If I insert a permutation in the middle of a function, it should
; automatically change everything that follows. For more complex macros I
; may also have multiple implementations, e.g. the SSE2 and SSSE3 versions
; may have different permutations.
;
; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro
; that permutes its arguments. It's equivalent to exchanging the contents of
; the registers, except that this way you exchange the register names
; instead, so it doesn't cost any cycles.

%macro PERMUTE 2-* ; takes a list of pairs to swap
    %rep %0/2
        %xdefine tmp%2 m%2
        %xdefine ntmp%2 nm%2
        %rotate 2
    %endrep
    %rep %0/2
        %xdefine m%1 tmp%2
        %xdefine nm%1 ntmp%2
        %undef tmp%2
        %undef ntmp%2
        %rotate 2
    %endrep
%endmacro

%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
    %rep %0-1
        %ifdef m%1
            %xdefine tmp m%1
            %xdefine m%1 m%2
            %xdefine m%2 tmp
            CAT_XDEFINE n, m%1, %1
            CAT_XDEFINE n, m%2, %2
        %else
            ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer
            ; the original numbers here. Be careful using this mode in
            ; nested macros though, as in some cases there may be other
            ; copies of m# that have already been dereferenced and don't get
            ; updated correctly.
            %xdefine %%n1 n %+ %1
            %xdefine %%n2 n %+ %2
            %xdefine tmp m %+ %%n1
            CAT_XDEFINE m, %%n1, m %+ %%n2
            CAT_XDEFINE m, %%n2, tmp
            CAT_XDEFINE n, m %+ %%n1, %%n1
            CAT_XDEFINE n, m %+ %%n2, %%n2
        %endif
        %undef tmp
        %rotate 1
    %endrep
%endmacro
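
; For illustration only: after
;     SWAP 0, 1
; subsequent uses of m0 assemble as the register previously called m1 and
; vice versa, with no mov emitted. A chain like "SWAP 0, 1, 2" applies the
; swaps left to right, ending with m0=old m1, m1=old m2, m2=old m0.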

; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
; calls to that function will automatically load the permutation, so values
; can be returned in mmregs.
%macro SAVE_MM_PERMUTATION 0-1
    %if %0
        %xdefine %%f %1_m
    %else
        %xdefine %%f current_function %+ _m
    %endif
    %assign %%i 0
    %rep num_mmregs
        CAT_XDEFINE %%f, %%i, m %+ %%i
        %assign %%i %%i+1
    %endrep
%endmacro

%macro LOAD_MM_PERMUTATION 1 ; name to load from
    %ifdef %1_m0
        %assign %%i 0
        %rep num_mmregs
            CAT_XDEFINE m, %%i, %1_m %+ %%i
            CAT_XDEFINE n, m %+ %%i, %%i
            %assign %%i %%i+1
        %endrep
    %endif
%endmacro
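
; For illustration only (hypothetical helper): a function that leaves its
; result in a swapped register can record the permutation, and the call
; wrapper below restores it at every call site:
;     cglobal my_helper
;         ...
;         SWAP 0, 2
;         SAVE_MM_PERMUTATION
;         ret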

; Append cpuflags to the callee's name iff the appended name is known and
; the plain name isn't
%macro call 1
    call_internal %1, %1 %+ SUFFIX
%endmacro
%macro call_internal 2
    %xdefine %%i %1
    %ifndef cglobaled_%1
        %ifdef cglobaled_%2
            %xdefine %%i %2
        %endif
    %endif
    call %%i
    LOAD_MM_PERMUTATION %%i
%endmacro

; Substitutions that reduce instruction size but are functionally equivalent
%macro add 2
    %ifnum %2
        %if %2==128
            sub %1, -128
        %else
            add %1, %2
        %endif
    %else
        add %1, %2
    %endif
%endmacro

%macro sub 2
    %ifnum %2
        %if %2==128
            add %1, -128
        %else
            sub %1, %2
        %endif
    %else
        sub %1, %2
    %endif
%endmacro
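
; For illustration only: +128 doesn't fit in a signed byte but -128 does, so
;     add r0, 128      ; would need a 4-byte immediate
; is assembled as
;     sub r0, -128     ; fits the sign-extended imm8 form, typically 3 bytes
;                      ; smaller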

;=============================================================================
; AVX abstraction layer
;=============================================================================

%assign i 0
%rep 16
    %if i < 8
        CAT_XDEFINE sizeofmm, i, 8
    %endif
    CAT_XDEFINE sizeofxmm, i, 16
    CAT_XDEFINE sizeofymm, i, 32
    %assign i i+1
%endrep
%undef i

%macro CHECK_AVX_INSTR_EMU 3-*
    %xdefine %%opcode %1
    %xdefine %%dst %2
    %rep %0-2
        %ifidn %%dst, %3
            %error non-AVX emulation of ``%%opcode'' is not supported
        %endif
        %rotate 1
    %endrep
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == number of operands given
;%5+: operands
%macro RUN_AVX_INSTR 6-7+
    %ifid %6
        %define %%sizeofreg sizeof%6
    %elifid %5
        %define %%sizeofreg sizeof%5
    %else
        %define %%sizeofreg mmsize
    %endif
    %if %%sizeofreg==32
        %if %4>=3
            v%1 %5, %6, %7
        %else
            v%1 %5, %6
        %endif
    %else
        %if %%sizeofreg==8
            %define %%regmov movq
        %elif %2
            %define %%regmov movaps
        %else
            %define %%regmov movdqa
        %endif

        %if %4>=3+%3
            %ifnidn %5, %6
                %if AVX_enabled && %%sizeofreg==16
                    v%1 %5, %6, %7
                %else
                    CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
                    %%regmov %5, %6
                    %1 %5, %7
                %endif
            %else
                %1 %5, %7
            %endif
        %elif %4>=3
            %1 %5, %6, %7
        %else
            %1 %5, %6
        %endif
    %endif
%endmacro

; 3arg AVX ops with a memory arg can only have it in src2,
; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
; So, if the op is symmetric and the wrong one is memory, swap them.
%macro RUN_AVX_INSTR1 8
    %assign %%swap 0
    %if AVX_enabled
        %ifnid %6
            %assign %%swap 1
        %endif
    %elifnidn %5, %6
        %ifnid %7
            %assign %%swap 1
        %endif
    %endif
    %if %%swap && %3 == 0 && %8 == 1
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
    %else
        RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
    %endif
%endmacro

;%1 == instruction
;%2 == 1 if float, 0 if int
;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
%macro AVX_INSTR 4
    %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
        %ifidn %3, fnord
            RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
        %elifidn %4, fnord
            RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
        %elifidn %5, fnord
            RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
        %else
            RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
        %endif
    %endmacro
%endmacro
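
; For illustration only: once paddw has been wrapped below, the 3-operand
; form
;     paddw m0, m1, m2
; assembles as "vpaddw xmm0, xmm1, xmm2" when AVX_enabled, and is otherwise
; emulated as "movdqa xmm0, xmm1" followed by "paddw xmm0, xmm2".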

AVX_INSTR addpd, 1, 0, 1
AVX_INSTR addps, 1, 0, 1
AVX_INSTR addsd, 1, 0, 1
AVX_INSTR addss, 1, 0, 1
AVX_INSTR addsubpd, 1, 0, 0
AVX_INSTR addsubps, 1, 0, 0
AVX_INSTR andpd, 1, 0, 1
AVX_INSTR andps, 1, 0, 1
AVX_INSTR andnpd, 1, 0, 0
AVX_INSTR andnps, 1, 0, 0
AVX_INSTR blendpd, 1, 0, 0
AVX_INSTR blendps, 1, 0, 0
AVX_INSTR blendvpd, 1, 0, 0
AVX_INSTR blendvps, 1, 0, 0
AVX_INSTR cmppd, 1, 0, 0
AVX_INSTR cmpps, 1, 0, 0
AVX_INSTR cmpsd, 1, 0, 0
AVX_INSTR cmpss, 1, 0, 0
AVX_INSTR cvtdq2ps, 1, 0, 0
AVX_INSTR cvtps2dq, 1, 0, 0
AVX_INSTR divpd, 1, 0, 0
AVX_INSTR divps, 1, 0, 0
AVX_INSTR divsd, 1, 0, 0
AVX_INSTR divss, 1, 0, 0
AVX_INSTR dppd, 1, 1, 0
AVX_INSTR dpps, 1, 1, 0
AVX_INSTR haddpd, 1, 0, 0
AVX_INSTR haddps, 1, 0, 0
AVX_INSTR hsubpd, 1, 0, 0
AVX_INSTR hsubps, 1, 0, 0
AVX_INSTR maxpd, 1, 0, 1
AVX_INSTR maxps, 1, 0, 1
AVX_INSTR maxsd, 1, 0, 1
AVX_INSTR maxss, 1, 0, 1
AVX_INSTR minpd, 1, 0, 1
AVX_INSTR minps, 1, 0, 1
AVX_INSTR minsd, 1, 0, 1
AVX_INSTR minss, 1, 0, 1
AVX_INSTR movhlps, 1, 0, 0
AVX_INSTR movlhps, 1, 0, 0
AVX_INSTR movsd, 1, 0, 0
AVX_INSTR movss, 1, 0, 0
AVX_INSTR mpsadbw, 0, 1, 0
AVX_INSTR mulpd, 1, 0, 1
AVX_INSTR mulps, 1, 0, 1
AVX_INSTR mulsd, 1, 0, 1
AVX_INSTR mulss, 1, 0, 1
AVX_INSTR orpd, 1, 0, 1
AVX_INSTR orps, 1, 0, 1
AVX_INSTR pabsb, 0, 0, 0
AVX_INSTR pabsw, 0, 0, 0
AVX_INSTR pabsd, 0, 0, 0
AVX_INSTR packsswb, 0, 0, 0
AVX_INSTR packssdw, 0, 0, 0
AVX_INSTR packuswb, 0, 0, 0
AVX_INSTR packusdw, 0, 0, 0
AVX_INSTR paddb, 0, 0, 1
AVX_INSTR paddw, 0, 0, 1
AVX_INSTR paddd, 0, 0, 1
AVX_INSTR paddq, 0, 0, 1
AVX_INSTR paddsb, 0, 0, 1
AVX_INSTR paddsw, 0, 0, 1
AVX_INSTR paddusb, 0, 0, 1
AVX_INSTR paddusw, 0, 0, 1
AVX_INSTR palignr, 0, 1, 0
AVX_INSTR pand, 0, 0, 1
AVX_INSTR pandn, 0, 0, 0
AVX_INSTR pavgb, 0, 0, 1
AVX_INSTR pavgw, 0, 0, 1
AVX_INSTR pblendvb, 0, 0, 0
AVX_INSTR pblendw, 0, 1, 0
AVX_INSTR pcmpestri, 0, 0, 0
AVX_INSTR pcmpestrm, 0, 0, 0
AVX_INSTR pcmpistri, 0, 0, 0
AVX_INSTR pcmpistrm, 0, 0, 0
AVX_INSTR pcmpeqb, 0, 0, 1
AVX_INSTR pcmpeqw, 0, 0, 1
AVX_INSTR pcmpeqd, 0, 0, 1
AVX_INSTR pcmpeqq, 0, 0, 1
AVX_INSTR pcmpgtb, 0, 0, 0
AVX_INSTR pcmpgtw, 0, 0, 0
AVX_INSTR pcmpgtd, 0, 0, 0
AVX_INSTR pcmpgtq, 0, 0, 0
AVX_INSTR phaddw, 0, 0, 0
AVX_INSTR phaddd, 0, 0, 0
AVX_INSTR phaddsw, 0, 0, 0
AVX_INSTR phsubw, 0, 0, 0
AVX_INSTR phsubd, 0, 0, 0
AVX_INSTR phsubsw, 0, 0, 0
AVX_INSTR pmaddwd, 0, 0, 1
AVX_INSTR pmaddubsw, 0, 0, 0
AVX_INSTR pmaxsb, 0, 0, 1
AVX_INSTR pmaxsw, 0, 0, 1
AVX_INSTR pmaxsd, 0, 0, 1
AVX_INSTR pmaxub, 0, 0, 1
AVX_INSTR pmaxuw, 0, 0, 1
AVX_INSTR pmaxud, 0, 0, 1
AVX_INSTR pminsb, 0, 0, 1
AVX_INSTR pminsw, 0, 0, 1
AVX_INSTR pminsd, 0, 0, 1
AVX_INSTR pminub, 0, 0, 1
AVX_INSTR pminuw, 0, 0, 1
AVX_INSTR pminud, 0, 0, 1
AVX_INSTR pmovmskb, 0, 0, 0
AVX_INSTR pmulhuw, 0, 0, 1
AVX_INSTR pmulhrsw, 0, 0, 1
AVX_INSTR pmulhw, 0, 0, 1
AVX_INSTR pmullw, 0, 0, 1
AVX_INSTR pmulld, 0, 0, 1
AVX_INSTR pmuludq, 0, 0, 1
AVX_INSTR pmuldq, 0, 0, 1
AVX_INSTR por, 0, 0, 1
AVX_INSTR psadbw, 0, 0, 1
AVX_INSTR pshufb, 0, 0, 0
AVX_INSTR pshufd, 0, 1, 0
AVX_INSTR pshufhw, 0, 1, 0
AVX_INSTR pshuflw, 0, 1, 0
AVX_INSTR psignb, 0, 0, 0
AVX_INSTR psignw, 0, 0, 0
AVX_INSTR psignd, 0, 0, 0
AVX_INSTR psllw, 0, 0, 0
AVX_INSTR pslld, 0, 0, 0
AVX_INSTR psllq, 0, 0, 0
AVX_INSTR pslldq, 0, 0, 0
AVX_INSTR psraw, 0, 0, 0
AVX_INSTR psrad, 0, 0, 0
AVX_INSTR psrlw, 0, 0, 0
AVX_INSTR psrld, 0, 0, 0
AVX_INSTR psrlq, 0, 0, 0
AVX_INSTR psrldq, 0, 0, 0
AVX_INSTR psubb, 0, 0, 0
AVX_INSTR psubw, 0, 0, 0
AVX_INSTR psubd, 0, 0, 0
AVX_INSTR psubq, 0, 0, 0
AVX_INSTR psubsb, 0, 0, 0
AVX_INSTR psubsw, 0, 0, 0
AVX_INSTR psubusb, 0, 0, 0
AVX_INSTR psubusw, 0, 0, 0
AVX_INSTR ptest, 0, 0, 0
AVX_INSTR punpckhbw, 0, 0, 0
AVX_INSTR punpckhwd, 0, 0, 0
AVX_INSTR punpckhdq, 0, 0, 0
AVX_INSTR punpckhqdq, 0, 0, 0
AVX_INSTR punpcklbw, 0, 0, 0
AVX_INSTR punpcklwd, 0, 0, 0
AVX_INSTR punpckldq, 0, 0, 0
AVX_INSTR punpcklqdq, 0, 0, 0
AVX_INSTR pxor, 0, 0, 1
AVX_INSTR shufps, 1, 1, 0
AVX_INSTR subpd, 1, 0, 0
AVX_INSTR subps, 1, 0, 0
AVX_INSTR subsd, 1, 0, 0
AVX_INSTR subss, 1, 0, 0
AVX_INSTR unpckhpd, 1, 0, 0
AVX_INSTR unpckhps, 1, 0, 0
AVX_INSTR unpcklpd, 1, 0, 0
AVX_INSTR unpcklps, 1, 0, 0
AVX_INSTR xorpd, 1, 0, 1
AVX_INSTR xorps, 1, 0, 1

; 3DNow instructions, for sharing code between AVX, SSE and 3DN
AVX_INSTR pfadd, 1, 0, 1
AVX_INSTR pfsub, 1, 0, 0
AVX_INSTR pfmul, 1, 0, 1

; base-4 constants for shuffles
%assign i 0
%rep 256
    %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
    %if j < 10
        CAT_XDEFINE q000, j, i
    %elif j < 100
        CAT_XDEFINE q00, j, i
    %elif j < 1000
        CAT_XDEFINE q0, j, i
    %else
        CAT_XDEFINE q, j, i
    %endif
    %assign i i+1
%endrep
%undef i
%undef j

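; For illustration only: q3120 expands to 0xD8 (base-4 digits 3,1,2,0), so
;     pshufd m0, m1, q3120
; writes src dwords 3,1,2,0 into dst lanes 3..0; the digits name the source
; lane for each destination lane from high to low.
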
%macro FMA_INSTR 3
    %macro %1 4-7 %1, %2, %3
        %if cpuflag(xop)
            v%5 %1, %2, %3, %4
        %else
            %6 %1, %2, %3
            %7 %1, %4
        %endif
    %endmacro
%endmacro

FMA_INSTR pmacsdd, pmulld, paddd
FMA_INSTR pmacsww, pmullw, paddw
FMA_INSTR pmadcswd, pmaddwd, paddd
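
; For illustration only: with the wrappers above,
;     pmacsww m0, m1, m2, m3
; assembles as the single XOP instruction vpmacsww when cpuflag(xop), and
; otherwise as the two-instruction fallback "pmullw m0, m1, m2" followed by
; "paddw m0, m3".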

; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
; This lets us use tzcnt without bumping the yasm version requirement yet.
%define tzcnt rep bsf