michael@0 | 1 | ;***************************************************************************** |
michael@0 | 2 | ;* x86inc.asm: x264asm abstraction layer |
michael@0 | 3 | ;***************************************************************************** |
michael@0 | 4 | ;* Copyright (C) 2005-2012 x264 project |
michael@0 | 5 | ;* |
michael@0 | 6 | ;* Authors: Loren Merritt <lorenm@u.washington.edu> |
michael@0 | 7 | ;* Anton Mitrofanov <BugMaster@narod.ru> |
michael@0 | 8 | ;* Jason Garrett-Glaser <darkshikari@gmail.com> |
michael@0 | 9 | ;* Henrik Gramner <hengar-6@student.ltu.se> |
michael@0 | 10 | ;* |
michael@0 | 11 | ;* Permission to use, copy, modify, and/or distribute this software for any |
michael@0 | 12 | ;* purpose with or without fee is hereby granted, provided that the above |
michael@0 | 13 | ;* copyright notice and this permission notice appear in all copies. |
michael@0 | 14 | ;* |
michael@0 | 15 | ;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES |
michael@0 | 16 | ;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF |
michael@0 | 17 | ;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR |
michael@0 | 18 | ;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
michael@0 | 19 | ;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN |
michael@0 | 20 | ;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF |
michael@0 | 21 | ;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. |
michael@0 | 22 | ;***************************************************************************** |
michael@0 | 23 | |
michael@0 | 24 | ; This is a header file for the x264ASM assembly language, which uses |
michael@0 | 25 | ; NASM/YASM syntax combined with a large number of macros to provide easy |
michael@0 | 26 | ; abstraction between different calling conventions (x86_32, win64, linux64). |
michael@0 | 27 | ; It also has various other useful features to simplify writing the kind of |
michael@0 | 28 | ; DSP functions that are most often used in x264. |
michael@0 | 29 | |
michael@0 | 30 | ; Unlike the rest of x264, this file is available under an ISC license, as it |
michael@0 | 31 | ; has significant usefulness outside of x264 and we want it to be available |
michael@0 | 32 | ; to the largest audience possible. Of course, if you modify it for your own |
michael@0 | 33 | ; purposes to add a new feature, we strongly encourage contributing a patch |
michael@0 | 34 | ; as this feature might be useful for others as well. Send patches or ideas |
michael@0 | 35 | ; to x264-devel@videolan.org . |
michael@0 | 36 | |
michael@0 | 37 | %include "vpx_config.asm" |
michael@0 | 38 | |
michael@0 | 39 | %define program_name vp9 |
michael@0 | 40 | |
michael@0 | 41 | |
michael@0 | 42 | %define UNIX64 0 |
michael@0 | 43 | %define WIN64 0 |
michael@0 | 44 | %if ARCH_X86_64 |
michael@0 | 45 | %ifidn __OUTPUT_FORMAT__,win32 |
michael@0 | 46 | %define WIN64 1 |
michael@0 | 47 | %elifidn __OUTPUT_FORMAT__,win64 |
michael@0 | 48 | %define WIN64 1 |
michael@0 | 49 | %elifidn __OUTPUT_FORMAT__,x64 |
michael@0 | 50 | %define WIN64 1 |
michael@0 | 51 | %else |
michael@0 | 52 | %define UNIX64 1 |
michael@0 | 53 | %endif |
michael@0 | 54 | %endif |
michael@0 | 55 | |
michael@0 | 56 | %ifidn __OUTPUT_FORMAT__,elf32 |
michael@0 | 57 | %define mangle(x) x |
michael@0 | 58 | %elifidn __OUTPUT_FORMAT__,elf64 |
michael@0 | 59 | %define mangle(x) x |
michael@0 | 60 | %elifidn __OUTPUT_FORMAT__,elf |
michael@0 | 61 | %define mangle(x) x |
michael@0 | 62 | %elifidn __OUTPUT_FORMAT__,x64 |
michael@0 | 63 | %define mangle(x) x |
michael@0 | 64 | %elifidn __OUTPUT_FORMAT__,win64 |
michael@0 | 65 | %define mangle(x) x |
michael@0 | 66 | %else |
michael@0 | 67 | %define mangle(x) _ %+ x |
michael@0 | 68 | %endif |
michael@0 | 69 | |
michael@0 | 70 | ; FIXME: All of the 64bit asm functions that take a stride as an argument |
michael@0 | 71 | ; via register, assume that the high dword of that register is filled with 0. |
michael@0 | 72 | ; This is true in practice (since we never do any 64bit arithmetic on strides, |
michael@0 | 73 | ; and x264's strides are all positive), but is not guaranteed by the ABI. |
michael@0 | 74 | |
michael@0 | 75 | ; Name of the .rodata section. |
michael@0 | 76 | ; Kludge: Something on OS X fails to align .rodata even given an align attribute, |
michael@0 | 77 | ; so use a different read-only section. |
michael@0 | 78 | %macro SECTION_RODATA 0-1 16 |
michael@0 | 79 | %ifidn __OUTPUT_FORMAT__,macho64 |
michael@0 | 80 | SECTION .text align=%1 |
michael@0 | 81 | %elifidn __OUTPUT_FORMAT__,macho |
michael@0 | 82 | SECTION .text align=%1 |
michael@0 | 83 | fakegot: |
michael@0 | 84 | %elifidn __OUTPUT_FORMAT__,aout |
michael@0 | 85 | section .text |
michael@0 | 86 | %else |
michael@0 | 87 | SECTION .rodata align=%1 |
michael@0 | 88 | %endif |
michael@0 | 89 | %endmacro |
michael@0 | 90 | |
michael@0 | 91 | ; aout does not support align= |
michael@0 | 92 | %macro SECTION_TEXT 0-1 16 |
michael@0 | 93 | %ifidn __OUTPUT_FORMAT__,aout |
michael@0 | 94 | SECTION .text |
michael@0 | 95 | %else |
michael@0 | 96 | SECTION .text align=%1 |
michael@0 | 97 | %endif |
michael@0 | 98 | %endmacro |
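; For instance (illustrative only): "SECTION_RODATA 32" requests a read-only
; section aligned to 32 bytes (remapped to .text on Mach-O as noted above),
; while a bare "SECTION_TEXT" gives a code section with the default 16-byte
; alignment.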
michael@0 | 99 | |
michael@0 | 100 | ; PIC macros are copied from vpx_ports/x86_abi_support.asm. The "define PIC" |
michael@0 | 101 | ; from the original code is added in for 64bit. |
michael@0 | 102 | %ifidn __OUTPUT_FORMAT__,elf32 |
michael@0 | 103 | %define ABI_IS_32BIT 1 |
michael@0 | 104 | %elifidn __OUTPUT_FORMAT__,macho32 |
michael@0 | 105 | %define ABI_IS_32BIT 1 |
michael@0 | 106 | %elifidn __OUTPUT_FORMAT__,win32 |
michael@0 | 107 | %define ABI_IS_32BIT 1 |
michael@0 | 108 | %elifidn __OUTPUT_FORMAT__,aout |
michael@0 | 109 | %define ABI_IS_32BIT 1 |
michael@0 | 110 | %else |
michael@0 | 111 | %define ABI_IS_32BIT 0 |
michael@0 | 112 | %endif |
michael@0 | 113 | |
michael@0 | 114 | %if ABI_IS_32BIT |
michael@0 | 115 | %if CONFIG_PIC=1 |
michael@0 | 116 | %ifidn __OUTPUT_FORMAT__,elf32 |
michael@0 | 117 | %define GET_GOT_SAVE_ARG 1 |
michael@0 | 118 | %define WRT_PLT wrt ..plt |
michael@0 | 119 | %macro GET_GOT 1 |
michael@0 | 120 | extern _GLOBAL_OFFSET_TABLE_ |
michael@0 | 121 | push %1 |
michael@0 | 122 | call %%get_got |
michael@0 | 123 | %%sub_offset: |
michael@0 | 124 | jmp %%exitGG |
michael@0 | 125 | %%get_got: |
michael@0 | 126 | mov %1, [esp] |
michael@0 | 127 | add %1, _GLOBAL_OFFSET_TABLE_ + $$ - %%sub_offset wrt ..gotpc |
michael@0 | 128 | ret |
michael@0 | 129 | %%exitGG: |
michael@0 | 130 | %undef GLOBAL |
michael@0 | 131 | %define GLOBAL(x) x + %1 wrt ..gotoff |
michael@0 | 132 | %undef RESTORE_GOT |
michael@0 | 133 | %define RESTORE_GOT pop %1 |
michael@0 | 134 | %endmacro |
michael@0 | 135 | %elifidn __OUTPUT_FORMAT__,macho32 |
michael@0 | 136 | %define GET_GOT_SAVE_ARG 1 |
michael@0 | 137 | %macro GET_GOT 1 |
michael@0 | 138 | push %1 |
michael@0 | 139 | call %%get_got |
michael@0 | 140 | %%get_got: |
michael@0 | 141 | pop %1 |
michael@0 | 142 | %undef GLOBAL |
michael@0 | 143 | %define GLOBAL(x) x + %1 - %%get_got |
michael@0 | 144 | %undef RESTORE_GOT |
michael@0 | 145 | %define RESTORE_GOT pop %1 |
michael@0 | 146 | %endmacro |
michael@0 | 147 | %endif |
michael@0 | 148 | %endif |
michael@0 | 149 | |
michael@0 | 150 | %if ARCH_X86_64 == 0 |
michael@0 | 151 | %undef PIC |
michael@0 | 152 | %endif |
michael@0 | 153 | |
michael@0 | 154 | %else |
michael@0 | 155 | %macro GET_GOT 1 |
michael@0 | 156 | %endmacro |
michael@0 | 157 | %define GLOBAL(x) rel x |
michael@0 | 158 | %define WRT_PLT wrt ..plt |
michael@0 | 159 | |
michael@0 | 160 | %if WIN64 |
michael@0 | 161 | %define PIC |
michael@0 | 162 | %elifidn __OUTPUT_FORMAT__,macho64 |
michael@0 | 163 | %define PIC |
michael@0 | 164 | %elif CONFIG_PIC |
michael@0 | 165 | %define PIC |
michael@0 | 166 | %endif |
michael@0 | 167 | %endif |
michael@0 | 168 | |
michael@0 | 169 | %ifnmacro GET_GOT |
michael@0 | 170 | %macro GET_GOT 1 |
michael@0 | 171 | %endmacro |
michael@0 | 172 | %define GLOBAL(x) x |
michael@0 | 173 | %endif |
michael@0 | 174 | %ifndef RESTORE_GOT |
michael@0 | 175 | %define RESTORE_GOT |
michael@0 | 176 | %endif |
michael@0 | 177 | %ifndef WRT_PLT |
michael@0 | 178 | %define WRT_PLT |
michael@0 | 179 | %endif |
michael@0 | 180 | |
michael@0 | 181 | %ifdef PIC |
michael@0 | 182 | default rel |
michael@0 | 183 | %endif |
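; Sketch of the intended 32-bit PIC pattern (names invented for illustration):
;   GET_GOT     r5                        ; save r5 and load the GOT address into it
;   movdqa      m0, [GLOBAL(some_table)]  ; GOT-relative load of a constant
;   RESTORE_GOT                           ; restore the saved r5
; On 64-bit and non-PIC builds GET_GOT/RESTORE_GOT expand to nothing and
; GLOBAL(x) becomes either "rel x" or plain "x".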
michael@0 | 184 | ; Done with PIC macros |
michael@0 | 185 | |
michael@0 | 186 | ; Always use long nops (reduces 0x90 spam in disassembly on x86_32) |
michael@0 | 187 | %ifndef __NASM_VER__ |
michael@0 | 188 | CPU amdnop |
michael@0 | 189 | %else |
michael@0 | 190 | %use smartalign |
michael@0 | 191 | ALIGNMODE k7 |
michael@0 | 192 | %endif |
michael@0 | 193 | |
michael@0 | 194 | ; Macros to eliminate most code duplication between x86_32 and x86_64: |
michael@0 | 195 | ; Currently this works only for leaf functions which load all their arguments |
michael@0 | 196 | ; into registers at the start, and make no other use of the stack. Luckily that |
michael@0 | 197 | ; covers most of x264's asm. |
michael@0 | 198 | |
michael@0 | 199 | ; PROLOGUE: |
michael@0 | 200 | ; %1 = number of arguments. loads them from stack if needed. |
michael@0 | 201 | ; %2 = number of registers used. pushes callee-saved regs if needed. |
michael@0 | 202 | ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. |
michael@0 | 203 | ; %4 = list of names to define to registers |
michael@0 | 204 | ; PROLOGUE can also be invoked by adding the same options to cglobal |
michael@0 | 205 | |
michael@0 | 206 | ; e.g. |
michael@0 | 207 | ; cglobal foo, 2,3,0, dst, src, tmp |
michael@0 | 208 | ; declares a function (foo), taking two args (dst and src) and one local variable (tmp) |
michael@0 | 209 | |
michael@0 | 210 | ; TODO Some functions can use some args directly from the stack. If they're the |
michael@0 | 211 | ; last args then you can just not declare them, but if they're in the middle |
michael@0 | 212 | ; we need a more flexible macro. |
michael@0 | 213 | |
michael@0 | 214 | ; RET: |
michael@0 | 215 | ; Pops anything that was pushed by PROLOGUE, and returns. |
michael@0 | 216 | |
michael@0 | 217 | ; REP_RET: |
michael@0 | 218 | ; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons |
michael@0 | 219 | ; which are slow when a normal ret follows a branch. |
michael@0 | 220 | |
michael@0 | 221 | ; registers: |
michael@0 | 222 | ; rN and rNq are the native-size register holding function argument N |
michael@0 | 223 | ; rNd, rNw, rNb are dword, word, and byte size |
michael@0 | 224 | ; rNm is the original location of arg N (a register or on the stack), dword |
michael@0 | 225 | ; rNmp is native size |
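; A fuller sketch (illustrative only; "byte_copy" and its argument names are
; invented here) of how the pieces fit together:
;   INIT_XMM sse2
;   cglobal byte_copy, 3,3,1, dst, src, n
;   .loop:
;       movu    m0, [srcq]
;       mova    [dstq], m0
;       add     srcq, mmsize
;       add     dstq, mmsize
;       sub     nd, mmsize
;       jg      .loop
;       REP_RET
; PROLOGUE (invoked by cglobal) loads dst/src/n into r0-r2 whatever the
; calling convention, and RET/REP_RET undoes anything it pushed.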
michael@0 | 226 | |
michael@0 | 227 | %macro DECLARE_REG 5-6 |
michael@0 | 228 | %define r%1q %2 |
michael@0 | 229 | %define r%1d %3 |
michael@0 | 230 | %define r%1w %4 |
michael@0 | 231 | %define r%1b %5 |
michael@0 | 232 | %if %0 == 5 |
michael@0 | 233 | %define r%1m %3 |
michael@0 | 234 | %define r%1mp %2 |
michael@0 | 235 | %elif ARCH_X86_64 ; memory |
michael@0 | 236 | %define r%1m [rsp + stack_offset + %6] |
michael@0 | 237 | %define r%1mp qword r %+ %1m |
michael@0 | 238 | %else |
michael@0 | 239 | %define r%1m [esp + stack_offset + %6] |
michael@0 | 240 | %define r%1mp dword r %+ %1m |
michael@0 | 241 | %endif |
michael@0 | 242 | %define r%1 %2 |
michael@0 | 243 | %endmacro |
michael@0 | 244 | |
michael@0 | 245 | %macro DECLARE_REG_SIZE 2 |
michael@0 | 246 | %define r%1q r%1 |
michael@0 | 247 | %define e%1q r%1 |
michael@0 | 248 | %define r%1d e%1 |
michael@0 | 249 | %define e%1d e%1 |
michael@0 | 250 | %define r%1w %1 |
michael@0 | 251 | %define e%1w %1 |
michael@0 | 252 | %define r%1b %2 |
michael@0 | 253 | %define e%1b %2 |
michael@0 | 254 | %if ARCH_X86_64 == 0 |
michael@0 | 255 | %define r%1 e%1 |
michael@0 | 256 | %endif |
michael@0 | 257 | %endmacro |
michael@0 | 258 | |
michael@0 | 259 | DECLARE_REG_SIZE ax, al |
michael@0 | 260 | DECLARE_REG_SIZE bx, bl |
michael@0 | 261 | DECLARE_REG_SIZE cx, cl |
michael@0 | 262 | DECLARE_REG_SIZE dx, dl |
michael@0 | 263 | DECLARE_REG_SIZE si, sil |
michael@0 | 264 | DECLARE_REG_SIZE di, dil |
michael@0 | 265 | DECLARE_REG_SIZE bp, bpl |
michael@0 | 266 | |
michael@0 | 267 | ; t# defines for when per-arch register allocation is more complex than just function arguments |
michael@0 | 268 | |
michael@0 | 269 | %macro DECLARE_REG_TMP 1-* |
michael@0 | 270 | %assign %%i 0 |
michael@0 | 271 | %rep %0 |
michael@0 | 272 | CAT_XDEFINE t, %%i, r%1 |
michael@0 | 273 | %assign %%i %%i+1 |
michael@0 | 274 | %rotate 1 |
michael@0 | 275 | %endrep |
michael@0 | 276 | %endmacro |
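; e.g. (illustrative) "DECLARE_REG_TMP 2,0,1" makes t0 an alias for r2, t1 for
; r0 and t2 for r1, so one body of code can be written in terms of t# even when
; the per-arch register assignment differs.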
michael@0 | 277 | |
michael@0 | 278 | %macro DECLARE_REG_TMP_SIZE 0-* |
michael@0 | 279 | %rep %0 |
michael@0 | 280 | %define t%1q t%1 %+ q |
michael@0 | 281 | %define t%1d t%1 %+ d |
michael@0 | 282 | %define t%1w t%1 %+ w |
michael@0 | 283 | %define t%1b t%1 %+ b |
michael@0 | 284 | %rotate 1 |
michael@0 | 285 | %endrep |
michael@0 | 286 | %endmacro |
michael@0 | 287 | |
michael@0 | 288 | DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 |
michael@0 | 289 | |
michael@0 | 290 | %if ARCH_X86_64 |
michael@0 | 291 | %define gprsize 8 |
michael@0 | 292 | %else |
michael@0 | 293 | %define gprsize 4 |
michael@0 | 294 | %endif |
michael@0 | 295 | |
michael@0 | 296 | %macro PUSH 1 |
michael@0 | 297 | push %1 |
michael@0 | 298 | %assign stack_offset stack_offset+gprsize |
michael@0 | 299 | %endmacro |
michael@0 | 300 | |
michael@0 | 301 | %macro POP 1 |
michael@0 | 302 | pop %1 |
michael@0 | 303 | %assign stack_offset stack_offset-gprsize |
michael@0 | 304 | %endmacro |
michael@0 | 305 | |
michael@0 | 306 | %macro PUSH_IF_USED 1-* |
michael@0 | 307 | %rep %0 |
michael@0 | 308 | %if %1 < regs_used |
michael@0 | 309 | PUSH r%1 |
michael@0 | 310 | %endif |
michael@0 | 311 | %rotate 1 |
michael@0 | 312 | %endrep |
michael@0 | 313 | %endmacro |
michael@0 | 314 | |
michael@0 | 315 | %macro POP_IF_USED 1-* |
michael@0 | 316 | %rep %0 |
michael@0 | 317 | %if %1 < regs_used |
michael@0 | 318 | pop r%1 |
michael@0 | 319 | %endif |
michael@0 | 320 | %rotate 1 |
michael@0 | 321 | %endrep |
michael@0 | 322 | %endmacro |
michael@0 | 323 | |
michael@0 | 324 | %macro LOAD_IF_USED 1-* |
michael@0 | 325 | %rep %0 |
michael@0 | 326 | %if %1 < num_args |
michael@0 | 327 | mov r%1, r %+ %1 %+ mp |
michael@0 | 328 | %endif |
michael@0 | 329 | %rotate 1 |
michael@0 | 330 | %endrep |
michael@0 | 331 | %endmacro |
michael@0 | 332 | |
michael@0 | 333 | %macro SUB 2 |
michael@0 | 334 | sub %1, %2 |
michael@0 | 335 | %ifidn %1, rsp |
michael@0 | 336 | %assign stack_offset stack_offset+(%2) |
michael@0 | 337 | %endif |
michael@0 | 338 | %endmacro |
michael@0 | 339 | |
michael@0 | 340 | %macro ADD 2 |
michael@0 | 341 | add %1, %2 |
michael@0 | 342 | %ifidn %1, rsp |
michael@0 | 343 | %assign stack_offset stack_offset-(%2) |
michael@0 | 344 | %endif |
michael@0 | 345 | %endmacro |
michael@0 | 346 | |
michael@0 | 347 | %macro movifnidn 2 |
michael@0 | 348 | %ifnidn %1, %2 |
michael@0 | 349 | mov %1, %2 |
michael@0 | 350 | %endif |
michael@0 | 351 | %endmacro |
michael@0 | 352 | |
michael@0 | 353 | %macro movsxdifnidn 2 |
michael@0 | 354 | %ifnidn %1, %2 |
michael@0 | 355 | movsxd %1, %2 |
michael@0 | 356 | %endif |
michael@0 | 357 | %endmacro |
michael@0 | 358 | |
michael@0 | 359 | %macro ASSERT 1 |
michael@0 | 360 | %if (%1) == 0 |
michael@0 | 361 | %error assert failed |
michael@0 | 362 | %endif |
michael@0 | 363 | %endmacro |
michael@0 | 364 | |
michael@0 | 365 | %macro DEFINE_ARGS 0-* |
michael@0 | 366 | %ifdef n_arg_names |
michael@0 | 367 | %assign %%i 0 |
michael@0 | 368 | %rep n_arg_names |
michael@0 | 369 | CAT_UNDEF arg_name %+ %%i, q |
michael@0 | 370 | CAT_UNDEF arg_name %+ %%i, d |
michael@0 | 371 | CAT_UNDEF arg_name %+ %%i, w |
michael@0 | 372 | CAT_UNDEF arg_name %+ %%i, b |
michael@0 | 373 | CAT_UNDEF arg_name %+ %%i, m |
michael@0 | 374 | CAT_UNDEF arg_name %+ %%i, mp |
michael@0 | 375 | CAT_UNDEF arg_name, %%i |
michael@0 | 376 | %assign %%i %%i+1 |
michael@0 | 377 | %endrep |
michael@0 | 378 | %endif |
michael@0 | 379 | |
michael@0 | 380 | %xdefine %%stack_offset stack_offset |
michael@0 | 381 | %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine |
michael@0 | 382 | %assign %%i 0 |
michael@0 | 383 | %rep %0 |
michael@0 | 384 | %xdefine %1q r %+ %%i %+ q |
michael@0 | 385 | %xdefine %1d r %+ %%i %+ d |
michael@0 | 386 | %xdefine %1w r %+ %%i %+ w |
michael@0 | 387 | %xdefine %1b r %+ %%i %+ b |
michael@0 | 388 | %xdefine %1m r %+ %%i %+ m |
michael@0 | 389 | %xdefine %1mp r %+ %%i %+ mp |
michael@0 | 390 | CAT_XDEFINE arg_name, %%i, %1 |
michael@0 | 391 | %assign %%i %%i+1 |
michael@0 | 392 | %rotate 1 |
michael@0 | 393 | %endrep |
michael@0 | 394 | %xdefine stack_offset %%stack_offset |
michael@0 | 395 | %assign n_arg_names %0 |
michael@0 | 396 | %endmacro |
michael@0 | 397 | |
michael@0 | 398 | %if WIN64 ; Windows x64 ;================================================= |
michael@0 | 399 | |
michael@0 | 400 | DECLARE_REG 0, rcx, ecx, cx, cl |
michael@0 | 401 | DECLARE_REG 1, rdx, edx, dx, dl |
michael@0 | 402 | DECLARE_REG 2, R8, R8D, R8W, R8B |
michael@0 | 403 | DECLARE_REG 3, R9, R9D, R9W, R9B |
michael@0 | 404 | DECLARE_REG 4, R10, R10D, R10W, R10B, 40 |
michael@0 | 405 | DECLARE_REG 5, R11, R11D, R11W, R11B, 48 |
michael@0 | 406 | DECLARE_REG 6, rax, eax, ax, al, 56 |
michael@0 | 407 | DECLARE_REG 7, rdi, edi, di, dil, 64 |
michael@0 | 408 | DECLARE_REG 8, rsi, esi, si, sil, 72 |
michael@0 | 409 | DECLARE_REG 9, rbx, ebx, bx, bl, 80 |
michael@0 | 410 | DECLARE_REG 10, rbp, ebp, bp, bpl, 88 |
michael@0 | 411 | DECLARE_REG 11, R12, R12D, R12W, R12B, 96 |
michael@0 | 412 | DECLARE_REG 12, R13, R13D, R13W, R13B, 104 |
michael@0 | 413 | DECLARE_REG 13, R14, R14D, R14W, R14B, 112 |
michael@0 | 414 | DECLARE_REG 14, R15, R15D, R15W, R15B, 120 |
michael@0 | 415 | |
michael@0 | 416 | %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... |
michael@0 | 417 | %assign num_args %1 |
michael@0 | 418 | %assign regs_used %2 |
michael@0 | 419 | ASSERT regs_used >= num_args |
michael@0 | 420 | ASSERT regs_used <= 15 |
michael@0 | 421 | PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 |
michael@0 | 422 | %if mmsize == 8 |
michael@0 | 423 | %assign xmm_regs_used 0 |
michael@0 | 424 | %else |
michael@0 | 425 | WIN64_SPILL_XMM %3 |
michael@0 | 426 | %endif |
michael@0 | 427 | LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 |
michael@0 | 428 | DEFINE_ARGS %4 |
michael@0 | 429 | %endmacro |
michael@0 | 430 | |
michael@0 | 431 | %macro WIN64_SPILL_XMM 1 |
michael@0 | 432 | %assign xmm_regs_used %1 |
michael@0 | 433 | ASSERT xmm_regs_used <= 16 |
michael@0 | 434 | %if xmm_regs_used > 6 |
michael@0 | 435 | SUB rsp, (xmm_regs_used-6)*16+16 |
michael@0 | 436 | %assign %%i xmm_regs_used |
michael@0 | 437 | %rep (xmm_regs_used-6) |
michael@0 | 438 | %assign %%i %%i-1 |
michael@0 | 439 | movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i |
michael@0 | 440 | %endrep |
michael@0 | 441 | %endif |
michael@0 | 442 | %endmacro |
michael@0 | 443 | |
michael@0 | 444 | %macro WIN64_RESTORE_XMM_INTERNAL 1 |
michael@0 | 445 | %if xmm_regs_used > 6 |
michael@0 | 446 | %assign %%i xmm_regs_used |
michael@0 | 447 | %rep (xmm_regs_used-6) |
michael@0 | 448 | %assign %%i %%i-1 |
michael@0 | 449 | movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] |
michael@0 | 450 | %endrep |
michael@0 | 451 | add %1, (xmm_regs_used-6)*16+16 |
michael@0 | 452 | %endif |
michael@0 | 453 | %endmacro |
michael@0 | 454 | |
michael@0 | 455 | %macro WIN64_RESTORE_XMM 1 |
michael@0 | 456 | WIN64_RESTORE_XMM_INTERNAL %1 |
michael@0 | 457 | %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 |
michael@0 | 458 | %assign xmm_regs_used 0 |
michael@0 | 459 | %endmacro |
michael@0 | 460 | |
michael@0 | 461 | %macro RET 0 |
michael@0 | 462 | WIN64_RESTORE_XMM_INTERNAL rsp |
michael@0 | 463 | POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7 |
michael@0 | 464 | ret |
michael@0 | 465 | %endmacro |
michael@0 | 466 | |
michael@0 | 467 | %macro REP_RET 0 |
michael@0 | 468 | %if regs_used > 7 || xmm_regs_used > 6 |
michael@0 | 469 | RET |
michael@0 | 470 | %else |
michael@0 | 471 | rep ret |
michael@0 | 472 | %endif |
michael@0 | 473 | %endmacro |
michael@0 | 474 | |
michael@0 | 475 | %elif ARCH_X86_64 ; *nix x64 ;============================================= |
michael@0 | 476 | |
michael@0 | 477 | DECLARE_REG 0, rdi, edi, di, dil |
michael@0 | 478 | DECLARE_REG 1, rsi, esi, si, sil |
michael@0 | 479 | DECLARE_REG 2, rdx, edx, dx, dl |
michael@0 | 480 | DECLARE_REG 3, rcx, ecx, cx, cl |
michael@0 | 481 | DECLARE_REG 4, R8, R8D, R8W, R8B |
michael@0 | 482 | DECLARE_REG 5, R9, R9D, R9W, R9B |
michael@0 | 483 | DECLARE_REG 6, rax, eax, ax, al, 8 |
michael@0 | 484 | DECLARE_REG 7, R10, R10D, R10W, R10B, 16 |
michael@0 | 485 | DECLARE_REG 8, R11, R11D, R11W, R11B, 24 |
michael@0 | 486 | DECLARE_REG 9, rbx, ebx, bx, bl, 32 |
michael@0 | 487 | DECLARE_REG 10, rbp, ebp, bp, bpl, 40 |
michael@0 | 488 | DECLARE_REG 11, R12, R12D, R12W, R12B, 48 |
michael@0 | 489 | DECLARE_REG 12, R13, R13D, R13W, R13B, 56 |
michael@0 | 490 | DECLARE_REG 13, R14, R14D, R14W, R14B, 64 |
michael@0 | 491 | DECLARE_REG 14, R15, R15D, R15W, R15B, 72 |
michael@0 | 492 | |
michael@0 | 493 | %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... |
michael@0 | 494 | %assign num_args %1 |
michael@0 | 495 | %assign regs_used %2 |
michael@0 | 496 | ASSERT regs_used >= num_args |
michael@0 | 497 | ASSERT regs_used <= 15 |
michael@0 | 498 | PUSH_IF_USED 9, 10, 11, 12, 13, 14 |
michael@0 | 499 | LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 |
michael@0 | 500 | DEFINE_ARGS %4 |
michael@0 | 501 | %endmacro |
michael@0 | 502 | |
michael@0 | 503 | %macro RET 0 |
michael@0 | 504 | POP_IF_USED 14, 13, 12, 11, 10, 9 |
michael@0 | 505 | ret |
michael@0 | 506 | %endmacro |
michael@0 | 507 | |
michael@0 | 508 | %macro REP_RET 0 |
michael@0 | 509 | %if regs_used > 9 |
michael@0 | 510 | RET |
michael@0 | 511 | %else |
michael@0 | 512 | rep ret |
michael@0 | 513 | %endif |
michael@0 | 514 | %endmacro |
michael@0 | 515 | |
michael@0 | 516 | %else ; X86_32 ;============================================================== |
michael@0 | 517 | |
michael@0 | 518 | DECLARE_REG 0, eax, eax, ax, al, 4 |
michael@0 | 519 | DECLARE_REG 1, ecx, ecx, cx, cl, 8 |
michael@0 | 520 | DECLARE_REG 2, edx, edx, dx, dl, 12 |
michael@0 | 521 | DECLARE_REG 3, ebx, ebx, bx, bl, 16 |
michael@0 | 522 | DECLARE_REG 4, esi, esi, si, null, 20 |
michael@0 | 523 | DECLARE_REG 5, edi, edi, di, null, 24 |
michael@0 | 524 | DECLARE_REG 6, ebp, ebp, bp, null, 28 |
michael@0 | 525 | %define rsp esp |
michael@0 | 526 | |
michael@0 | 527 | %macro DECLARE_ARG 1-* |
michael@0 | 528 | %rep %0 |
michael@0 | 529 | %define r%1m [esp + stack_offset + 4*%1 + 4] |
michael@0 | 530 | %define r%1mp dword r%1m |
michael@0 | 531 | %rotate 1 |
michael@0 | 532 | %endrep |
michael@0 | 533 | %endmacro |
michael@0 | 534 | |
michael@0 | 535 | DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 |
michael@0 | 536 | |
michael@0 | 537 | %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... |
michael@0 | 538 | %assign num_args %1 |
michael@0 | 539 | %assign regs_used %2 |
michael@0 | 540 | %if regs_used > 7 |
michael@0 | 541 | %assign regs_used 7 |
michael@0 | 542 | %endif |
michael@0 | 543 | ASSERT regs_used >= num_args |
michael@0 | 544 | PUSH_IF_USED 3, 4, 5, 6 |
michael@0 | 545 | LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 |
michael@0 | 546 | DEFINE_ARGS %4 |
michael@0 | 547 | %endmacro |
michael@0 | 548 | |
michael@0 | 549 | %macro RET 0 |
michael@0 | 550 | POP_IF_USED 6, 5, 4, 3 |
michael@0 | 551 | ret |
michael@0 | 552 | %endmacro |
michael@0 | 553 | |
michael@0 | 554 | %macro REP_RET 0 |
michael@0 | 555 | %if regs_used > 3 |
michael@0 | 556 | RET |
michael@0 | 557 | %else |
michael@0 | 558 | rep ret |
michael@0 | 559 | %endif |
michael@0 | 560 | %endmacro |
michael@0 | 561 | |
michael@0 | 562 | %endif ;====================================================================== |
michael@0 | 563 | |
michael@0 | 564 | %if WIN64 == 0 |
michael@0 | 565 | %macro WIN64_SPILL_XMM 1 |
michael@0 | 566 | %endmacro |
michael@0 | 567 | %macro WIN64_RESTORE_XMM 1 |
michael@0 | 568 | %endmacro |
michael@0 | 569 | %endif |
michael@0 | 570 | |
michael@0 | 571 | ;============================================================================= |
michael@0 | 572 | ; arch-independent part |
michael@0 | 573 | ;============================================================================= |
michael@0 | 574 | |
michael@0 | 575 | %assign function_align 16 |
michael@0 | 576 | |
michael@0 | 577 | ; Begin a function. |
michael@0 | 578 | ; Applies any symbol mangling needed for C linkage, and sets up a define such that |
michael@0 | 579 | ; subsequent uses of the function name automatically refer to the mangled version. |
michael@0 | 580 | ; Appends cpuflags to the function name if cpuflags has been specified. |
michael@0 | 581 | %macro cglobal 1-2+ ; name, [PROLOGUE args] |
michael@0 | 582 | %if %0 == 1 |
michael@0 | 583 | cglobal_internal %1 %+ SUFFIX |
michael@0 | 584 | %else |
michael@0 | 585 | cglobal_internal %1 %+ SUFFIX, %2 |
michael@0 | 586 | %endif |
michael@0 | 587 | %endmacro |
michael@0 | 588 | %macro cglobal_internal 1-2+ |
michael@0 | 589 | %ifndef cglobaled_%1 |
michael@0 | 590 | %xdefine %1 mangle(program_name %+ _ %+ %1) |
michael@0 | 591 | %xdefine %1.skip_prologue %1 %+ .skip_prologue |
michael@0 | 592 | CAT_XDEFINE cglobaled_, %1, 1 |
michael@0 | 593 | %endif |
michael@0 | 594 | %xdefine current_function %1 |
michael@0 | 595 | %ifidn __OUTPUT_FORMAT__,elf |
michael@0 | 596 | global %1:function hidden |
michael@0 | 597 | %elifidn __OUTPUT_FORMAT__,elf32 |
michael@0 | 598 | global %1:function hidden |
michael@0 | 599 | %elifidn __OUTPUT_FORMAT__,elf64 |
michael@0 | 600 | global %1:function hidden |
michael@0 | 601 | %elifidn __OUTPUT_FORMAT__,macho32 |
michael@0 | 602 | global %1:private_extern |
michael@0 | 603 | %elifidn __OUTPUT_FORMAT__,macho64 |
michael@0 | 604 | global %1:private_extern |
michael@0 | 605 | %else |
michael@0 | 606 | global %1 |
michael@0 | 607 | %endif |
michael@0 | 608 | align function_align |
michael@0 | 609 | %1: |
michael@0 | 610 | RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer |
michael@0 | 611 | %assign stack_offset 0 |
michael@0 | 612 | %if %0 > 1 |
michael@0 | 613 | PROLOGUE %2 |
michael@0 | 614 | %endif |
michael@0 | 615 | %endmacro |
michael@0 | 616 | |
michael@0 | 617 | %macro cextern 1 |
michael@0 | 618 | %xdefine %1 mangle(program_name %+ _ %+ %1) |
michael@0 | 619 | CAT_XDEFINE cglobaled_, %1, 1 |
michael@0 | 620 | extern %1 |
michael@0 | 621 | %endmacro |
michael@0 | 622 | |
michael@0 | 623 | ; like cextern, but without the prefix |
michael@0 | 624 | %macro cextern_naked 1 |
michael@0 | 625 | %xdefine %1 mangle(%1) |
michael@0 | 626 | CAT_XDEFINE cglobaled_, %1, 1 |
michael@0 | 627 | extern %1 |
michael@0 | 628 | %endmacro |
michael@0 | 629 | |
michael@0 | 630 | %macro const 2+ |
michael@0 | 631 | %xdefine %1 mangle(program_name %+ _ %+ %1) |
michael@0 | 632 | global %1 |
michael@0 | 633 | %1: %2 |
michael@0 | 634 | %endmacro |
michael@0 | 635 | |
michael@0 | 636 | ; This is needed for ELF, otherwise the GNU linker assumes the stack is |
michael@0 | 637 | ; executable by default. |
michael@0 | 638 | %ifidn __OUTPUT_FORMAT__,elf |
michael@0 | 639 | SECTION .note.GNU-stack noalloc noexec nowrite progbits |
michael@0 | 640 | %elifidn __OUTPUT_FORMAT__,elf32 |
michael@0 | 641 | SECTION .note.GNU-stack noalloc noexec nowrite progbits |
michael@0 | 642 | %elifidn __OUTPUT_FORMAT__,elf64 |
michael@0 | 643 | SECTION .note.GNU-stack noalloc noexec nowrite progbits |
michael@0 | 644 | %endif |
michael@0 | 645 | |
michael@0 | 646 | ; cpuflags |
michael@0 | 647 | |
michael@0 | 648 | %assign cpuflags_mmx (1<<0) |
michael@0 | 649 | %assign cpuflags_mmx2 (1<<1) | cpuflags_mmx |
michael@0 | 650 | %assign cpuflags_3dnow (1<<2) | cpuflags_mmx |
michael@0 | 651 | %assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow |
michael@0 | 652 | %assign cpuflags_sse (1<<4) | cpuflags_mmx2 |
michael@0 | 653 | %assign cpuflags_sse2 (1<<5) | cpuflags_sse |
michael@0 | 654 | %assign cpuflags_sse2slow (1<<6) | cpuflags_sse2 |
michael@0 | 655 | %assign cpuflags_sse3 (1<<7) | cpuflags_sse2 |
michael@0 | 656 | %assign cpuflags_ssse3 (1<<8) | cpuflags_sse3 |
michael@0 | 657 | %assign cpuflags_sse4 (1<<9) | cpuflags_ssse3 |
michael@0 | 658 | %assign cpuflags_sse42 (1<<10)| cpuflags_sse4 |
michael@0 | 659 | %assign cpuflags_avx (1<<11)| cpuflags_sse42 |
michael@0 | 660 | %assign cpuflags_xop (1<<12)| cpuflags_avx |
michael@0 | 661 | %assign cpuflags_fma4 (1<<13)| cpuflags_avx |
michael@0 | 662 | |
michael@0 | 663 | %assign cpuflags_cache32 (1<<16) |
michael@0 | 664 | %assign cpuflags_cache64 (1<<17) |
michael@0 | 665 | %assign cpuflags_slowctz (1<<18) |
michael@0 | 666 | %assign cpuflags_lzcnt (1<<19) |
michael@0 | 667 | %assign cpuflags_misalign (1<<20) |
michael@0 | 668 | %assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant |
michael@0 | 669 | %assign cpuflags_atom (1<<22) |
michael@0 | 670 | |
michael@0 | 671 | %define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x)) |
michael@0 | 672 | %define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x)) |
michael@0 | 673 | |
michael@0 | 674 | ; Takes up to 2 cpuflags from the above list. |
michael@0 | 675 | ; All subsequent functions (up to the next INIT_CPUFLAGS) are built for the specified cpu. |
michael@0 | 676 | ; You shouldn't need to invoke this macro directly; it's a subroutine for INIT_MMX & co. |
michael@0 | 677 | %macro INIT_CPUFLAGS 0-2 |
michael@0 | 678 | %if %0 >= 1 |
michael@0 | 679 | %xdefine cpuname %1 |
michael@0 | 680 | %assign cpuflags cpuflags_%1 |
michael@0 | 681 | %if %0 >= 2 |
michael@0 | 682 | %xdefine cpuname %1_%2 |
michael@0 | 683 | %assign cpuflags cpuflags | cpuflags_%2 |
michael@0 | 684 | %endif |
michael@0 | 685 | %xdefine SUFFIX _ %+ cpuname |
michael@0 | 686 | %if cpuflag(avx) |
michael@0 | 687 | %assign avx_enabled 1 |
michael@0 | 688 | %endif |
michael@0 | 689 | %if mmsize == 16 && notcpuflag(sse2) |
michael@0 | 690 | %define mova movaps |
michael@0 | 691 | %define movu movups |
michael@0 | 692 | %define movnta movntps |
michael@0 | 693 | %endif |
michael@0 | 694 | %if cpuflag(aligned) |
michael@0 | 695 | %define movu mova |
michael@0 | 696 | %elifidn %1, sse3 |
michael@0 | 697 | %define movu lddqu |
michael@0 | 698 | %endif |
michael@0 | 699 | %else |
michael@0 | 700 | %xdefine SUFFIX |
michael@0 | 701 | %undef cpuname |
michael@0 | 702 | %undef cpuflags |
michael@0 | 703 | %endif |
michael@0 | 704 | %endmacro |
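; For example (illustrative; "filter_row" is an invented name): "INIT_XMM ssse3"
; sets cpuname to ssse3, SUFFIX to _ssse3 and cpuflags to cpuflags_ssse3, so a
; following "cglobal filter_row" really defines vp9_filter_row_ssse3 (before any
; mangle() prefix), and cpuflag(sse2) evaluates to 1 inside it.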
michael@0 | 705 | |
michael@0 | 706 | ; merge mmx and sse* |
michael@0 | 707 | |
michael@0 | 708 | %macro CAT_XDEFINE 3 |
michael@0 | 709 | %xdefine %1%2 %3 |
michael@0 | 710 | %endmacro |
michael@0 | 711 | |
michael@0 | 712 | %macro CAT_UNDEF 2 |
michael@0 | 713 | %undef %1%2 |
michael@0 | 714 | %endmacro |
michael@0 | 715 | |
michael@0 | 716 | %macro INIT_MMX 0-1+ |
michael@0 | 717 | %assign avx_enabled 0 |
michael@0 | 718 | %define RESET_MM_PERMUTATION INIT_MMX %1 |
michael@0 | 719 | %define mmsize 8 |
michael@0 | 720 | %define num_mmregs 8 |
michael@0 | 721 | %define mova movq |
michael@0 | 722 | %define movu movq |
michael@0 | 723 | %define movh movd |
michael@0 | 724 | %define movnta movntq |
michael@0 | 725 | %assign %%i 0 |
michael@0 | 726 | %rep 8 |
michael@0 | 727 | CAT_XDEFINE m, %%i, mm %+ %%i |
michael@0 | 728 | CAT_XDEFINE nmm, %%i, %%i |
michael@0 | 729 | %assign %%i %%i+1 |
michael@0 | 730 | %endrep |
michael@0 | 731 | %rep 8 |
michael@0 | 732 | CAT_UNDEF m, %%i |
michael@0 | 733 | CAT_UNDEF nmm, %%i |
michael@0 | 734 | %assign %%i %%i+1 |
michael@0 | 735 | %endrep |
michael@0 | 736 | INIT_CPUFLAGS %1 |
michael@0 | 737 | %endmacro |
michael@0 | 738 | |
michael@0 | 739 | %macro INIT_XMM 0-1+ |
michael@0 | 740 | %assign avx_enabled 0 |
michael@0 | 741 | %define RESET_MM_PERMUTATION INIT_XMM %1 |
michael@0 | 742 | %define mmsize 16 |
michael@0 | 743 | %define num_mmregs 8 |
michael@0 | 744 | %if ARCH_X86_64 |
michael@0 | 745 | %define num_mmregs 16 |
michael@0 | 746 | %endif |
michael@0 | 747 | %define mova movdqa |
michael@0 | 748 | %define movu movdqu |
michael@0 | 749 | %define movh movq |
michael@0 | 750 | %define movnta movntdq |
michael@0 | 751 | %assign %%i 0 |
michael@0 | 752 | %rep num_mmregs |
michael@0 | 753 | CAT_XDEFINE m, %%i, xmm %+ %%i |
michael@0 | 754 | CAT_XDEFINE nxmm, %%i, %%i |
michael@0 | 755 | %assign %%i %%i+1 |
michael@0 | 756 | %endrep |
michael@0 | 757 | INIT_CPUFLAGS %1 |
michael@0 | 758 | %endmacro |
michael@0 | 759 | |
michael@0 | 760 | ; FIXME: INIT_AVX can be replaced by INIT_XMM avx |
michael@0 | 761 | %macro INIT_AVX 0 |
michael@0 | 762 | INIT_XMM |
michael@0 | 763 | %assign avx_enabled 1 |
michael@0 | 764 | %define PALIGNR PALIGNR_SSSE3 |
michael@0 | 765 | %define RESET_MM_PERMUTATION INIT_AVX |
michael@0 | 766 | %endmacro |
michael@0 | 767 | |
michael@0 | 768 | %macro INIT_YMM 0-1+ |
michael@0 | 769 | %assign avx_enabled 1 |
michael@0 | 770 | %define RESET_MM_PERMUTATION INIT_YMM %1 |
michael@0 | 771 | %define mmsize 32 |
michael@0 | 772 | %define num_mmregs 8 |
michael@0 | 773 | %if ARCH_X86_64 |
michael@0 | 774 | %define num_mmregs 16 |
michael@0 | 775 | %endif |
michael@0 | 776 | %define mova vmovaps |
michael@0 | 777 | %define movu vmovups |
michael@0 | 778 | %undef movh |
michael@0 | 779 | %define movnta vmovntps |
michael@0 | 780 | %assign %%i 0 |
michael@0 | 781 | %rep num_mmregs |
michael@0 | 782 | CAT_XDEFINE m, %%i, ymm %+ %%i |
michael@0 | 783 | CAT_XDEFINE nymm, %%i, %%i |
michael@0 | 784 | %assign %%i %%i+1 |
michael@0 | 785 | %endrep |
michael@0 | 786 | INIT_CPUFLAGS %1 |
michael@0 | 787 | %endmacro |
michael@0 | 788 | |
michael@0 | 789 | INIT_XMM |
michael@0 | 790 | |
michael@0 | 791 | ; I often want to use macros that permute their arguments. e.g. there's no |
michael@0 | 792 | ; efficient way to implement butterfly or transpose or dct without swapping some |
michael@0 | 793 | ; arguments. |
michael@0 | 794 | ; |
michael@0 | 795 | ; I would like to not have to manually keep track of the permutations: |
michael@0 | 796 | ; If I insert a permutation in the middle of a function, it should automatically |
michael@0 | 797 | ; change everything that follows. For more complex macros I may also have multiple |
michael@0 | 798 | ; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations. |
michael@0 | 799 | ; |
michael@0 | 800 | ; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that |
michael@0 | 801 | ; permutes its arguments. It's equivalent to exchanging the contents of the |
michael@0 | 802 | ; registers, except that this way you exchange the register names instead, so it |
michael@0 | 803 | ; doesn't cost any cycles. |
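; For example (illustrative): if a transpose leaves its results with rows 1 and
; 2 exchanged, ending the macro with
;     SWAP 1, 2
; renames m1<->m2 so that callers keep seeing the logical row numbers; the swap
; itself emits no instructions.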
michael@0 | 804 | |
michael@0 | 805 | %macro PERMUTE 2-* ; takes a list of pairs to swap |
michael@0 | 806 | %rep %0/2 |
michael@0 | 807 | %xdefine tmp%2 m%2 |
michael@0 | 808 | %xdefine ntmp%2 nm%2 |
michael@0 | 809 | %rotate 2 |
michael@0 | 810 | %endrep |
michael@0 | 811 | %rep %0/2 |
michael@0 | 812 | %xdefine m%1 tmp%2 |
michael@0 | 813 | %xdefine nm%1 ntmp%2 |
michael@0 | 814 | %undef tmp%2 |
michael@0 | 815 | %undef ntmp%2 |
michael@0 | 816 | %rotate 2 |
michael@0 | 817 | %endrep |
michael@0 | 818 | %endmacro |
michael@0 | 819 | |
michael@0 | 820 | %macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs) |
michael@0 | 821 | %rep %0-1 |
michael@0 | 822 | %ifdef m%1 |
michael@0 | 823 | %xdefine tmp m%1 |
michael@0 | 824 | %xdefine m%1 m%2 |
michael@0 | 825 | %xdefine m%2 tmp |
michael@0 | 826 | CAT_XDEFINE n, m%1, %1 |
michael@0 | 827 | CAT_XDEFINE n, m%2, %2 |
michael@0 | 828 | %else |
michael@0 | 829 | ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here. |
michael@0 | 830 | ; Be careful using this mode in nested macros though, as in some cases there may be |
michael@0 | 831 | ; other copies of m# that have already been dereferenced and don't get updated correctly. |
michael@0 | 832 | %xdefine %%n1 n %+ %1 |
michael@0 | 833 | %xdefine %%n2 n %+ %2 |
michael@0 | 834 | %xdefine tmp m %+ %%n1 |
michael@0 | 835 | CAT_XDEFINE m, %%n1, m %+ %%n2 |
michael@0 | 836 | CAT_XDEFINE m, %%n2, tmp |
michael@0 | 837 | CAT_XDEFINE n, m %+ %%n1, %%n1 |
michael@0 | 838 | CAT_XDEFINE n, m %+ %%n2, %%n2 |
michael@0 | 839 | %endif |
michael@0 | 840 | %undef tmp |
michael@0 | 841 | %rotate 1 |
michael@0 | 842 | %endrep |
michael@0 | 843 | %endmacro |
michael@0 | 844 | |
michael@0 | 845 | ; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later |
michael@0 | 846 | ; calls to that function will automatically load the permutation, so values can |
michael@0 | 847 | ; be returned in mmregs. |
michael@0 | 848 | %macro SAVE_MM_PERMUTATION 0-1 |
michael@0 | 849 | %if %0 |
michael@0 | 850 | %xdefine %%f %1_m |
michael@0 | 851 | %else |
michael@0 | 852 | %xdefine %%f current_function %+ _m |
michael@0 | 853 | %endif |
michael@0 | 854 | %assign %%i 0 |
michael@0 | 855 | %rep num_mmregs |
michael@0 | 856 | CAT_XDEFINE %%f, %%i, m %+ %%i |
michael@0 | 857 | %assign %%i %%i+1 |
michael@0 | 858 | %endrep |
michael@0 | 859 | %endmacro |
michael@0 | 860 | |
michael@0 | 861 | %macro LOAD_MM_PERMUTATION 1 ; name to load from |
michael@0 | 862 | %ifdef %1_m0 |
michael@0 | 863 | %assign %%i 0 |
michael@0 | 864 | %rep num_mmregs |
michael@0 | 865 | CAT_XDEFINE m, %%i, %1_m %+ %%i |
michael@0 | 866 | CAT_XDEFINE n, m %+ %%i, %%i |
michael@0 | 867 | %assign %%i %%i+1 |
michael@0 | 868 | %endrep |
michael@0 | 869 | %endif |
michael@0 | 870 | %endmacro |
michael@0 | 871 | |
michael@0 | 872 | ; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't |
michael@0 | 873 | %macro call 1 |
michael@0 | 874 | call_internal %1, %1 %+ SUFFIX |
michael@0 | 875 | %endmacro |
michael@0 | 876 | %macro call_internal 2 |
michael@0 | 877 | %xdefine %%i %1 |
michael@0 | 878 | %ifndef cglobaled_%1 |
michael@0 | 879 | %ifdef cglobaled_%2 |
michael@0 | 880 | %xdefine %%i %2 |
michael@0 | 881 | %endif |
michael@0 | 882 | %endif |
michael@0 | 883 | call %%i |
michael@0 | 884 | LOAD_MM_PERMUTATION %%i |
michael@0 | 885 | %endmacro |
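; Illustrative use (names invented): a helper that ends with
;     SWAP 0, 1
;     SAVE_MM_PERMUTATION
;     RET
; records that what the caller should treat as m0 currently lives in the other
; register; a later "call helper" through the macro above appends SUFFIX when a
; cpu-specific version exists and reloads that saved permutation automatically.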
michael@0 | 886 | |
michael@0 | 887 | ; Substitutions that reduce instruction size but are functionally equivalent |
michael@0 | 888 | %macro add 2 |
michael@0 | 889 | %ifnum %2 |
michael@0 | 890 | %if %2==128 |
michael@0 | 891 | sub %1, -128 |
michael@0 | 892 | %else |
michael@0 | 893 | add %1, %2 |
michael@0 | 894 | %endif |
michael@0 | 895 | %else |
michael@0 | 896 | add %1, %2 |
michael@0 | 897 | %endif |
michael@0 | 898 | %endmacro |
michael@0 | 899 | |
michael@0 | 900 | %macro sub 2 |
michael@0 | 901 | %ifnum %2 |
michael@0 | 902 | %if %2==128 |
michael@0 | 903 | add %1, -128 |
michael@0 | 904 | %else |
michael@0 | 905 | sub %1, %2 |
michael@0 | 906 | %endif |
michael@0 | 907 | %else |
michael@0 | 908 | sub %1, %2 |
michael@0 | 909 | %endif |
michael@0 | 910 | %endmacro |
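; e.g. "add r0, 128" is emitted as "sub r0, -128": -128 fits in a sign-extended
; 8-bit immediate while +128 does not, so the substitution saves 3 bytes.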
michael@0 | 911 | |
michael@0 | 912 | ;============================================================================= |
michael@0 | 913 | ; AVX abstraction layer |
michael@0 | 914 | ;============================================================================= |
michael@0 | 915 | |
michael@0 | 916 | %assign i 0 |
michael@0 | 917 | %rep 16 |
michael@0 | 918 | %if i < 8 |
michael@0 | 919 | CAT_XDEFINE sizeofmm, i, 8 |
michael@0 | 920 | %endif |
michael@0 | 921 | CAT_XDEFINE sizeofxmm, i, 16 |
michael@0 | 922 | CAT_XDEFINE sizeofymm, i, 32 |
michael@0 | 923 | %assign i i+1 |
michael@0 | 924 | %endrep |
michael@0 | 925 | %undef i |
michael@0 | 926 | |
michael@0 | 927 | ;%1 == instruction |
michael@0 | 928 | ;%2 == 1 if float, 0 if int |
michael@0 | 929 | ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm) |
michael@0 | 930 | ;%4 == number of operands given |
michael@0 | 931 | ;%5+: operands |
michael@0 | 932 | %macro RUN_AVX_INSTR 6-7+ |
michael@0 | 933 | %ifid %5 |
michael@0 | 934 | %define %%size sizeof%5 |
michael@0 | 935 | %else |
michael@0 | 936 | %define %%size mmsize |
michael@0 | 937 | %endif |
michael@0 | 938 | %if %%size==32 |
michael@0 | 939 | %if %0 >= 7 |
michael@0 | 940 | v%1 %5, %6, %7 |
michael@0 | 941 | %else |
michael@0 | 942 | v%1 %5, %6 |
michael@0 | 943 | %endif |
michael@0 | 944 | %else |
michael@0 | 945 | %if %%size==8 |
michael@0 | 946 | %define %%regmov movq |
michael@0 | 947 | %elif %2 |
michael@0 | 948 | %define %%regmov movaps |
michael@0 | 949 | %else |
michael@0 | 950 | %define %%regmov movdqa |
michael@0 | 951 | %endif |
michael@0 | 952 | |
michael@0 | 953 | %if %4>=3+%3 |
michael@0 | 954 | %ifnidn %5, %6 |
michael@0 | 955 | %if avx_enabled && sizeof%5==16 |
michael@0 | 956 | v%1 %5, %6, %7 |
michael@0 | 957 | %else |
michael@0 | 958 | %%regmov %5, %6 |
michael@0 | 959 | %1 %5, %7 |
michael@0 | 960 | %endif |
michael@0 | 961 | %else |
michael@0 | 962 | %1 %5, %7 |
michael@0 | 963 | %endif |
michael@0 | 964 | %elif %3 |
michael@0 | 965 | %1 %5, %6, %7 |
michael@0 | 966 | %else |
michael@0 | 967 | %1 %5, %6 |
michael@0 | 968 | %endif |
michael@0 | 969 | %endif |
michael@0 | 970 | %endmacro |
michael@0 | 971 | |
michael@0 | 972 | ; 3arg AVX ops with a memory arg can only have it in src2, |
michael@0 | 973 | ; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov). |
michael@0 | 974 | ; So, if the op is symmetric and the wrong one is memory, swap them. |
michael@0 | 975 | %macro RUN_AVX_INSTR1 8 |
michael@0 | 976 | %assign %%swap 0 |
michael@0 | 977 | %if avx_enabled |
michael@0 | 978 | %ifnid %6 |
michael@0 | 979 | %assign %%swap 1 |
michael@0 | 980 | %endif |
michael@0 | 981 | %elifnidn %5, %6 |
michael@0 | 982 | %ifnid %7 |
michael@0 | 983 | %assign %%swap 1 |
michael@0 | 984 | %endif |
michael@0 | 985 | %endif |
michael@0 | 986 | %if %%swap && %3 == 0 && %8 == 1 |
michael@0 | 987 | RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6 |
michael@0 | 988 | %else |
michael@0 | 989 | RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7 |
michael@0 | 990 | %endif |
michael@0 | 991 | %endmacro |
michael@0 | 992 | |
michael@0 | 993 | ;%1 == instruction |
michael@0 | 994 | ;%2 == 1 if float, 0 if int |
michael@0 | 995 | ;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 3-operand (xmm, xmm, xmm) |
michael@0 | 996 | ;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not |
michael@0 | 997 | %macro AVX_INSTR 4 |
michael@0 | 998 | %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4 |
michael@0 | 999 | %ifidn %3, fnord |
michael@0 | 1000 | RUN_AVX_INSTR %6, %7, %8, 2, %1, %2 |
michael@0 | 1001 | %elifidn %4, fnord |
michael@0 | 1002 | RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9 |
michael@0 | 1003 | %elifidn %5, fnord |
michael@0 | 1004 | RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4 |
michael@0 | 1005 | %else |
michael@0 | 1006 | RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5 |
michael@0 | 1007 | %endif |
michael@0 | 1008 | %endmacro |
michael@0 | 1009 | %endmacro |
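; Example expansion (illustrative), once paddw has been declared below:
;     paddw m0, m1, m2
; emits "vpaddw xmm0, xmm1, xmm2" when avx_enabled, and otherwise
; "movdqa xmm0, xmm1" followed by "paddw xmm0, xmm2" (the movdqa is skipped
; when the destination is already the first source).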
michael@0 | 1010 | |
michael@0 | 1011 | AVX_INSTR addpd, 1, 0, 1 |
michael@0 | 1012 | AVX_INSTR addps, 1, 0, 1 |
michael@0 | 1013 | AVX_INSTR addsd, 1, 0, 1 |
michael@0 | 1014 | AVX_INSTR addss, 1, 0, 1 |
michael@0 | 1015 | AVX_INSTR addsubpd, 1, 0, 0 |
michael@0 | 1016 | AVX_INSTR addsubps, 1, 0, 0 |
michael@0 | 1017 | AVX_INSTR andpd, 1, 0, 1 |
michael@0 | 1018 | AVX_INSTR andps, 1, 0, 1 |
michael@0 | 1019 | AVX_INSTR andnpd, 1, 0, 0 |
michael@0 | 1020 | AVX_INSTR andnps, 1, 0, 0 |
michael@0 | 1021 | AVX_INSTR blendpd, 1, 0, 0 |
michael@0 | 1022 | AVX_INSTR blendps, 1, 0, 0 |
michael@0 | 1023 | AVX_INSTR blendvpd, 1, 0, 0 |
michael@0 | 1024 | AVX_INSTR blendvps, 1, 0, 0 |
michael@0 | 1025 | AVX_INSTR cmppd, 1, 0, 0 |
michael@0 | 1026 | AVX_INSTR cmpps, 1, 0, 0 |
michael@0 | 1027 | AVX_INSTR cmpsd, 1, 0, 0 |
michael@0 | 1028 | AVX_INSTR cmpss, 1, 0, 0 |
michael@0 | 1029 | AVX_INSTR cvtdq2ps, 1, 0, 0 |
michael@0 | 1030 | AVX_INSTR cvtps2dq, 1, 0, 0 |
michael@0 | 1031 | AVX_INSTR divpd, 1, 0, 0 |
michael@0 | 1032 | AVX_INSTR divps, 1, 0, 0 |
michael@0 | 1033 | AVX_INSTR divsd, 1, 0, 0 |
michael@0 | 1034 | AVX_INSTR divss, 1, 0, 0 |
michael@0 | 1035 | AVX_INSTR dppd, 1, 1, 0 |
michael@0 | 1036 | AVX_INSTR dpps, 1, 1, 0 |
michael@0 | 1037 | AVX_INSTR haddpd, 1, 0, 0 |
michael@0 | 1038 | AVX_INSTR haddps, 1, 0, 0 |
michael@0 | 1039 | AVX_INSTR hsubpd, 1, 0, 0 |
michael@0 | 1040 | AVX_INSTR hsubps, 1, 0, 0 |
michael@0 | 1041 | AVX_INSTR maxpd, 1, 0, 1 |
michael@0 | 1042 | AVX_INSTR maxps, 1, 0, 1 |
michael@0 | 1043 | AVX_INSTR maxsd, 1, 0, 1 |
michael@0 | 1044 | AVX_INSTR maxss, 1, 0, 1 |
michael@0 | 1045 | AVX_INSTR minpd, 1, 0, 1 |
michael@0 | 1046 | AVX_INSTR minps, 1, 0, 1 |
michael@0 | 1047 | AVX_INSTR minsd, 1, 0, 1 |
michael@0 | 1048 | AVX_INSTR minss, 1, 0, 1 |
michael@0 | 1049 | AVX_INSTR movhlps, 1, 0, 0 |
michael@0 | 1050 | AVX_INSTR movlhps, 1, 0, 0 |
michael@0 | 1051 | AVX_INSTR movsd, 1, 0, 0 |
michael@0 | 1052 | AVX_INSTR movss, 1, 0, 0 |
michael@0 | 1053 | AVX_INSTR mpsadbw, 0, 1, 0 |
michael@0 | 1054 | AVX_INSTR mulpd, 1, 0, 1 |
michael@0 | 1055 | AVX_INSTR mulps, 1, 0, 1 |
michael@0 | 1056 | AVX_INSTR mulsd, 1, 0, 1 |
michael@0 | 1057 | AVX_INSTR mulss, 1, 0, 1 |
michael@0 | 1058 | AVX_INSTR orpd, 1, 0, 1 |
michael@0 | 1059 | AVX_INSTR orps, 1, 0, 1 |
michael@0 | 1060 | AVX_INSTR packsswb, 0, 0, 0 |
michael@0 | 1061 | AVX_INSTR packssdw, 0, 0, 0 |
michael@0 | 1062 | AVX_INSTR packuswb, 0, 0, 0 |
michael@0 | 1063 | AVX_INSTR packusdw, 0, 0, 0 |
michael@0 | 1064 | AVX_INSTR paddb, 0, 0, 1 |
michael@0 | 1065 | AVX_INSTR paddw, 0, 0, 1 |
michael@0 | 1066 | AVX_INSTR paddd, 0, 0, 1 |
michael@0 | 1067 | AVX_INSTR paddq, 0, 0, 1 |
michael@0 | 1068 | AVX_INSTR paddsb, 0, 0, 1 |
michael@0 | 1069 | AVX_INSTR paddsw, 0, 0, 1 |
michael@0 | 1070 | AVX_INSTR paddusb, 0, 0, 1 |
michael@0 | 1071 | AVX_INSTR paddusw, 0, 0, 1 |
michael@0 | 1072 | AVX_INSTR palignr, 0, 1, 0 |
michael@0 | 1073 | AVX_INSTR pand, 0, 0, 1 |
michael@0 | 1074 | AVX_INSTR pandn, 0, 0, 0 |
michael@0 | 1075 | AVX_INSTR pavgb, 0, 0, 1 |
michael@0 | 1076 | AVX_INSTR pavgw, 0, 0, 1 |
michael@0 | 1077 | AVX_INSTR pblendvb, 0, 0, 0 |
michael@0 | 1078 | AVX_INSTR pblendw, 0, 1, 0 |
michael@0 | 1079 | AVX_INSTR pcmpestri, 0, 0, 0 |
michael@0 | 1080 | AVX_INSTR pcmpestrm, 0, 0, 0 |
michael@0 | 1081 | AVX_INSTR pcmpistri, 0, 0, 0 |
michael@0 | 1082 | AVX_INSTR pcmpistrm, 0, 0, 0 |
michael@0 | 1083 | AVX_INSTR pcmpeqb, 0, 0, 1 |
michael@0 | 1084 | AVX_INSTR pcmpeqw, 0, 0, 1 |
michael@0 | 1085 | AVX_INSTR pcmpeqd, 0, 0, 1 |
michael@0 | 1086 | AVX_INSTR pcmpeqq, 0, 0, 1 |
michael@0 | 1087 | AVX_INSTR pcmpgtb, 0, 0, 0 |
michael@0 | 1088 | AVX_INSTR pcmpgtw, 0, 0, 0 |
michael@0 | 1089 | AVX_INSTR pcmpgtd, 0, 0, 0 |
michael@0 | 1090 | AVX_INSTR pcmpgtq, 0, 0, 0 |
michael@0 | 1091 | AVX_INSTR phaddw, 0, 0, 0 |
michael@0 | 1092 | AVX_INSTR phaddd, 0, 0, 0 |
michael@0 | 1093 | AVX_INSTR phaddsw, 0, 0, 0 |
michael@0 | 1094 | AVX_INSTR phsubw, 0, 0, 0 |
michael@0 | 1095 | AVX_INSTR phsubd, 0, 0, 0 |
michael@0 | 1096 | AVX_INSTR phsubsw, 0, 0, 0 |
michael@0 | 1097 | AVX_INSTR pmaddwd, 0, 0, 1 |
michael@0 | 1098 | AVX_INSTR pmaddubsw, 0, 0, 0 |
michael@0 | 1099 | AVX_INSTR pmaxsb, 0, 0, 1 |
michael@0 | 1100 | AVX_INSTR pmaxsw, 0, 0, 1 |
michael@0 | 1101 | AVX_INSTR pmaxsd, 0, 0, 1 |
michael@0 | 1102 | AVX_INSTR pmaxub, 0, 0, 1 |
michael@0 | 1103 | AVX_INSTR pmaxuw, 0, 0, 1 |
michael@0 | 1104 | AVX_INSTR pmaxud, 0, 0, 1 |
michael@0 | 1105 | AVX_INSTR pminsb, 0, 0, 1 |
michael@0 | 1106 | AVX_INSTR pminsw, 0, 0, 1 |
michael@0 | 1107 | AVX_INSTR pminsd, 0, 0, 1 |
michael@0 | 1108 | AVX_INSTR pminub, 0, 0, 1 |
michael@0 | 1109 | AVX_INSTR pminuw, 0, 0, 1 |
michael@0 | 1110 | AVX_INSTR pminud, 0, 0, 1 |
michael@0 | 1111 | AVX_INSTR pmulhuw, 0, 0, 1 |
michael@0 | 1112 | AVX_INSTR pmulhrsw, 0, 0, 1 |
michael@0 | 1113 | AVX_INSTR pmulhw, 0, 0, 1 |
michael@0 | 1114 | AVX_INSTR pmullw, 0, 0, 1 |
michael@0 | 1115 | AVX_INSTR pmulld, 0, 0, 1 |
michael@0 | 1116 | AVX_INSTR pmuludq, 0, 0, 1 |
michael@0 | 1117 | AVX_INSTR pmuldq, 0, 0, 1 |
michael@0 | 1118 | AVX_INSTR por, 0, 0, 1 |
michael@0 | 1119 | AVX_INSTR psadbw, 0, 0, 1 |
michael@0 | 1120 | AVX_INSTR pshufb, 0, 0, 0 |
michael@0 | 1121 | AVX_INSTR psignb, 0, 0, 0 |
michael@0 | 1122 | AVX_INSTR psignw, 0, 0, 0 |
michael@0 | 1123 | AVX_INSTR psignd, 0, 0, 0 |
michael@0 | 1124 | AVX_INSTR psllw, 0, 0, 0 |
michael@0 | 1125 | AVX_INSTR pslld, 0, 0, 0 |
michael@0 | 1126 | AVX_INSTR psllq, 0, 0, 0 |
michael@0 | 1127 | AVX_INSTR pslldq, 0, 0, 0 |
michael@0 | 1128 | AVX_INSTR psraw, 0, 0, 0 |
michael@0 | 1129 | AVX_INSTR psrad, 0, 0, 0 |
michael@0 | 1130 | AVX_INSTR psrlw, 0, 0, 0 |
michael@0 | 1131 | AVX_INSTR psrld, 0, 0, 0 |
michael@0 | 1132 | AVX_INSTR psrlq, 0, 0, 0 |
michael@0 | 1133 | AVX_INSTR psrldq, 0, 0, 0 |
michael@0 | 1134 | AVX_INSTR psubb, 0, 0, 0 |
michael@0 | 1135 | AVX_INSTR psubw, 0, 0, 0 |
michael@0 | 1136 | AVX_INSTR psubd, 0, 0, 0 |
michael@0 | 1137 | AVX_INSTR psubq, 0, 0, 0 |
michael@0 | 1138 | AVX_INSTR psubsb, 0, 0, 0 |
michael@0 | 1139 | AVX_INSTR psubsw, 0, 0, 0 |
michael@0 | 1140 | AVX_INSTR psubusb, 0, 0, 0 |
michael@0 | 1141 | AVX_INSTR psubusw, 0, 0, 0 |
michael@0 | 1142 | AVX_INSTR punpckhbw, 0, 0, 0 |
michael@0 | 1143 | AVX_INSTR punpckhwd, 0, 0, 0 |
michael@0 | 1144 | AVX_INSTR punpckhdq, 0, 0, 0 |
michael@0 | 1145 | AVX_INSTR punpckhqdq, 0, 0, 0 |
michael@0 | 1146 | AVX_INSTR punpcklbw, 0, 0, 0 |
michael@0 | 1147 | AVX_INSTR punpcklwd, 0, 0, 0 |
michael@0 | 1148 | AVX_INSTR punpckldq, 0, 0, 0 |
michael@0 | 1149 | AVX_INSTR punpcklqdq, 0, 0, 0 |
michael@0 | 1150 | AVX_INSTR pxor, 0, 0, 1 |
michael@0 | 1151 | AVX_INSTR shufps, 1, 1, 0 |
michael@0 | 1152 | AVX_INSTR subpd, 1, 0, 0 |
michael@0 | 1153 | AVX_INSTR subps, 1, 0, 0 |
michael@0 | 1154 | AVX_INSTR subsd, 1, 0, 0 |
michael@0 | 1155 | AVX_INSTR subss, 1, 0, 0 |
michael@0 | 1156 | AVX_INSTR unpckhpd, 1, 0, 0 |
michael@0 | 1157 | AVX_INSTR unpckhps, 1, 0, 0 |
michael@0 | 1158 | AVX_INSTR unpcklpd, 1, 0, 0 |
michael@0 | 1159 | AVX_INSTR unpcklps, 1, 0, 0 |
michael@0 | 1160 | AVX_INSTR xorpd, 1, 0, 1 |
michael@0 | 1161 | AVX_INSTR xorps, 1, 0, 1 |
michael@0 | 1162 | |
michael@0 | 1163 | ; 3DNow instructions, for sharing code between AVX, SSE and 3DN |
michael@0 | 1164 | AVX_INSTR pfadd, 1, 0, 1 |
michael@0 | 1165 | AVX_INSTR pfsub, 1, 0, 0 |
michael@0 | 1166 | AVX_INSTR pfmul, 1, 0, 1 |
michael@0 | 1167 | |
michael@0 | 1168 | ; base-4 constants for shuffles |
michael@0 | 1169 | %assign i 0 |
michael@0 | 1170 | %rep 256 |
michael@0 | 1171 | %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3) |
michael@0 | 1172 | %if j < 10 |
michael@0 | 1173 | CAT_XDEFINE q000, j, i |
michael@0 | 1174 | %elif j < 100 |
michael@0 | 1175 | CAT_XDEFINE q00, j, i |
michael@0 | 1176 | %elif j < 1000 |
michael@0 | 1177 | CAT_XDEFINE q0, j, i |
michael@0 | 1178 | %else |
michael@0 | 1179 | CAT_XDEFINE q, j, i |
michael@0 | 1180 | %endif |
michael@0 | 1181 | %assign i i+1 |
michael@0 | 1182 | %endrep |
michael@0 | 1183 | %undef i |
michael@0 | 1184 | %undef j |
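; e.g. q3120 expands to 0xd8 (binary 11 01 10 00), so
;     pshufd m0, m1, q3120
; picks source dwords 0,2,1,3 for the destination, lowest dword first (the
; constant's rightmost digit selects the lowest dword).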
michael@0 | 1185 | |
michael@0 | 1186 | %macro FMA_INSTR 3 |
michael@0 | 1187 | %macro %1 4-7 %1, %2, %3 |
michael@0 | 1188 | %if cpuflag(xop) |
michael@0 | 1189 | v%5 %1, %2, %3, %4 |
michael@0 | 1190 | %else |
michael@0 | 1191 | %6 %1, %2, %3 |
michael@0 | 1192 | %7 %1, %4 |
michael@0 | 1193 | %endif |
michael@0 | 1194 | %endmacro |
michael@0 | 1195 | %endmacro |
michael@0 | 1196 | |
michael@0 | 1197 | FMA_INSTR pmacsdd, pmulld, paddd |
michael@0 | 1198 | FMA_INSTR pmacsww, pmullw, paddw |
michael@0 | 1199 | FMA_INSTR pmadcswd, pmaddwd, paddd |
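; e.g. (illustrative) "pmacsdd m0, m1, m2, m3" emits the single XOP instruction
; "vpmacsdd xmm0, xmm1, xmm2, xmm3" when cpuflag(xop), and otherwise falls back
; to "pmulld m0, m1, m2" (itself emulated via movdqa+pmulld without AVX)
; followed by "paddd m0, m3".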